From 45658ccccba609053f6a27948e066a5de3fd8ab5 Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Mon, 2 Dec 2024 10:10:51 +0000 Subject: [PATCH 01/65] Update pgvector to 0.8.0 (#9733) --- compute/compute-node.Dockerfile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 956701805393..222a0cb88b59 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -358,10 +358,10 @@ COPY compute/patches/pgvector.patch /pgvector.patch # because we build the images on different machines than where we run them. # Pass OPTFLAGS="" to remove it. # -# vector 0.7.4 supports v17 -# last release v0.7.4 - Aug 5, 2024 -RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.7.4.tar.gz -O pgvector.tar.gz && \ - echo "0341edf89b1924ae0d552f617e14fb7f8867c0194ed775bcc44fa40288642583 pgvector.tar.gz" | sha256sum --check && \ +# vector >0.7.4 supports v17 +# last release v0.8.0 - Oct 30, 2024 +RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.8.0.tar.gz -O pgvector.tar.gz && \ + echo "867a2c328d4928a5a9d6f052cd3bc78c7d60228a9b914ad32aa3db88e9de27b0 pgvector.tar.gz" | sha256sum --check && \ mkdir pgvector-src && cd pgvector-src && tar xzf ../pgvector.tar.gz --strip-components=1 -C . && \ patch -p1 < /pgvector.patch && \ make -j $(getconf _NPROCESSORS_ONLN) OPTFLAGS="" PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ From 533012204913edf56fbcfc723e878b92fea5d93c Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Mon, 2 Dec 2024 12:26:15 +0200 Subject: [PATCH 02/65] test_runner: improve `wait_until` (#9936) Improves `wait_until` by: * Use `timeout` instead of `iterations`. This allows changing the timeout/interval parameters independently. * Make `timeout` and `interval` optional (default 20s and 0.5s). Most callers don't care. * Only output status every 1s by default, and add optional `status_interval` parameter. * Remove `show_intermediate_error`, this was always emitted anyway. Most callers have been updated to use the defaults, except where they had good reason otherwise. 
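For illustration, a typical call-site migration under the new signature looks like this (a sketch; the predicates shown are real ones from call sites updated in this patch, and the defaults are those introduced in `fixtures/utils.py`):

```python
# Old style: positional iteration count and poll interval
# (effective budget = iterations * interval, e.g. 20 * 0.5 = 10s).
wait_until(20, 0.5, complete)
wait_until(60, 1, assert_broken)

# New style: the predicate comes first; `timeout` and `interval`
# default to 20s and 0.5s, so only non-default values are spelled out.
wait_until(complete)
wait_until(assert_broken, timeout=60)
```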
--- test_runner/fixtures/neon_fixtures.py | 14 ++-- test_runner/fixtures/pageserver/utils.py | 17 +---- test_runner/fixtures/safekeeper/http.py | 2 +- test_runner/fixtures/safekeeper/utils.py | 2 +- test_runner/fixtures/utils.py | 35 +++++----- test_runner/logical_repl/test_clickhouse.py | 6 +- test_runner/logical_repl/test_debezium.py | 12 +--- .../performance/test_branch_creation.py | 8 +-- .../regress/test_attach_tenant_config.py | 2 - test_runner/regress/test_compaction.py | 2 +- .../regress/test_disk_usage_eviction.py | 10 +-- test_runner/regress/test_hot_standby.py | 6 +- .../regress/test_layers_from_future.py | 2 +- test_runner/regress/test_logging.py | 4 +- .../regress/test_logical_replication.py | 6 +- test_runner/regress/test_lsn_mapping.py | 4 +- test_runner/regress/test_neon_superuser.py | 2 +- test_runner/regress/test_ondemand_download.py | 16 ++--- test_runner/regress/test_pageserver_api.py | 12 +--- .../regress/test_pageserver_generations.py | 14 ++-- .../test_pageserver_getpage_throttle.py | 7 +- .../regress/test_pageserver_layer_rolling.py | 16 ++--- .../regress/test_pageserver_restart.py | 2 +- .../regress/test_pageserver_secondary.py | 6 +- test_runner/regress/test_readonly_node.py | 2 - test_runner/regress/test_remote_storage.py | 42 ++++++----- test_runner/regress/test_replica_start.py | 2 +- test_runner/regress/test_sharding.py | 18 ++--- .../regress/test_storage_controller.py | 70 +++++++++---------- test_runner/regress/test_storage_scrubber.py | 2 - .../regress/test_subscriber_restart.py | 2 +- test_runner/regress/test_tenant_conf.py | 6 +- test_runner/regress/test_tenant_delete.py | 14 ++-- test_runner/regress/test_tenant_detach.py | 8 +-- test_runner/regress/test_tenant_relocation.py | 6 +- test_runner/regress/test_tenant_size.py | 4 +- test_runner/regress/test_tenant_tasks.py | 2 +- test_runner/regress/test_tenants.py | 2 +- .../test_tenants_with_remote_storage.py | 12 +--- test_runner/regress/test_timeline_archive.py | 6 +- test_runner/regress/test_timeline_delete.py | 61 +++++----------- .../regress/test_timeline_detach_ancestor.py | 30 ++++---- .../regress/test_timeline_gc_blocking.py | 2 +- test_runner/regress/test_timeline_size.py | 36 +++++----- test_runner/regress/test_wal_acceptor.py | 22 +++--- test_runner/regress/test_wal_receiver.py | 4 +- 46 files changed, 234 insertions(+), 326 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 9bcfffeb9cf5..5709a3b82b96 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1736,7 +1736,7 @@ def wait_until_ready(self): def storage_controller_ready(): assert self.ready() is True - wait_until(30, 1, storage_controller_ready) + wait_until(storage_controller_ready) return time.time() - t1 def attach_hook_issue( @@ -2574,7 +2574,7 @@ def complete(): log.info(f"any_unstable={any_unstable}") assert not any_unstable - wait_until(20, 0.5, complete) + wait_until(complete) def __enter__(self) -> Self: return self @@ -3973,7 +3973,7 @@ def check_migrations_done(): migration_id: int = cur.fetchall()[0][0] assert migration_id >= num_migrations - wait_until(20, 0.5, check_migrations_done) + wait_until(check_migrations_done) # Mock the extension part of spec passed from control plane for local testing # endpooint.rs adds content of this file as a part of the spec.json @@ -4489,12 +4489,10 @@ def are_lsns_advanced(): ) assert stat.remote_consistent_lsn >= lsn and stat.backup_lsn >= lsn.segment_lsn() - # xxx: max wait is long 
because we might be waiting for reconnection from - # pageserver to this safekeeper - wait_until(30, 1, are_lsns_advanced) + wait_until(are_lsns_advanced) client.checkpoint(tenant_id, timeline_id) if wait_wal_removal: - wait_until(30, 1, are_segments_removed) + wait_until(are_segments_removed) def wait_until_paused(self, failpoint: str): msg = f"at failpoint {failpoint}" @@ -4503,7 +4501,7 @@ def paused(): log.info(f"waiting for hitting failpoint {failpoint}") self.assert_log_contains(msg) - wait_until(20, 0.5, paused) + wait_until(paused) class NeonBroker(LogUtils): diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 46700e3fe377..7c10edc5fc33 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -13,7 +13,7 @@ from fixtures.common_types import Lsn, TenantId, TenantShardId, TimelineId from fixtures.log_helper import log from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient -from fixtures.remote_storage import RemoteStorage, RemoteStorageKind, S3Storage +from fixtures.remote_storage import RemoteStorage, S3Storage from fixtures.utils import wait_until if TYPE_CHECKING: @@ -269,12 +269,7 @@ def wait_timeline_detail_404( pageserver_http: PageserverHttpClient, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, - iterations: int, - interval: float | None = None, ): - if interval is None: - interval = 0.25 - def timeline_is_missing(): data = {} try: @@ -287,19 +282,17 @@ def timeline_is_missing(): raise RuntimeError(f"Timeline exists state {data.get('state')}") - wait_until(iterations, interval, func=timeline_is_missing) + wait_until(timeline_is_missing) def timeline_delete_wait_completed( pageserver_http: PageserverHttpClient, tenant_id: TenantId | TenantShardId, timeline_id: TimelineId, - iterations: int = 20, - interval: float | None = None, **delete_args, ) -> None: pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args) - wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations, interval) + wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id) # remote_storage must not be None, but that's easier for callers to make mypy happy @@ -453,7 +446,3 @@ def many_small_layers_tenant_config() -> dict[str, Any]: "checkpoint_distance": 1024**2, "image_creation_threshold": 100, } - - -def poll_for_remote_storage_iterations(remote_storage_kind: RemoteStorageKind) -> int: - return 40 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 15 diff --git a/test_runner/fixtures/safekeeper/http.py b/test_runner/fixtures/safekeeper/http.py index 094188c0b5f5..286f80ba69f1 100644 --- a/test_runner/fixtures/safekeeper/http.py +++ b/test_runner/fixtures/safekeeper/http.py @@ -175,7 +175,7 @@ def timeline_start_lsn_non_zero() -> Lsn: assert s > Lsn(0) return s - return wait_until(30, 1, timeline_start_lsn_non_zero) + return wait_until(timeline_start_lsn_non_zero) def get_commit_lsn(self, tenant_id: TenantId, timeline_id: TimelineId) -> Lsn: return self.timeline_status(tenant_id, timeline_id).commit_lsn diff --git a/test_runner/fixtures/safekeeper/utils.py b/test_runner/fixtures/safekeeper/utils.py index 024691647033..922cdedccc21 100644 --- a/test_runner/fixtures/safekeeper/utils.py +++ b/test_runner/fixtures/safekeeper/utils.py @@ -19,4 +19,4 @@ def walreceivers_absent(): log.info(f"waiting for walreceivers to be gone, currently {status.walreceivers}") assert len(status.walreceivers) == 0 - wait_until(30, 
0.5, walreceivers_absent) + wait_until(walreceivers_absent) diff --git a/test_runner/fixtures/utils.py b/test_runner/fixtures/utils.py index 04e98fe494db..c34ac298d1cc 100644 --- a/test_runner/fixtures/utils.py +++ b/test_runner/fixtures/utils.py @@ -9,6 +9,7 @@ import threading import time from collections.abc import Callable, Iterable +from datetime import datetime, timedelta from hashlib import sha256 from pathlib import Path from typing import TYPE_CHECKING, Any, TypeVar @@ -380,15 +381,10 @@ def start_in_background( if return_code is not None: error = f"expected subprocess to run but it exited with code {return_code}" else: - attempts = 10 try: - wait_until( - number_of_iterations=attempts, - interval=1, - func=is_started, - ) + wait_until(is_started, timeout=10) except Exception: - error = f"Failed to get correct status from subprocess in {attempts} attempts" + error = "Failed to get correct status from subprocess" except Exception as e: error = f"expected subprocess to start but it failed with exception: {e}" @@ -402,28 +398,31 @@ def start_in_background( def wait_until( - number_of_iterations: int, - interval: float, func: Callable[[], WaitUntilRet], - show_intermediate_error: bool = False, + name: str | None = None, + timeout: float = 20.0, # seconds + interval: float = 0.5, # seconds + status_interval: float = 1.0, # seconds ) -> WaitUntilRet: """ Wait until 'func' returns successfully, without exception. Returns the last return value from the function. """ + if name is None: + name = getattr(func, "__name__", repr(func)) + deadline = datetime.now() + timedelta(seconds=timeout) + next_status = datetime.now() last_exception = None - for i in range(number_of_iterations): + while datetime.now() <= deadline: try: - res = func() + return func() except Exception as e: - log.info("waiting for %s iteration %s failed: %s", func, i + 1, e) + if datetime.now() >= next_status: + log.info("waiting for %s: %s", name, e) + next_status = datetime.now() + timedelta(seconds=status_interval) last_exception = e - if show_intermediate_error: - log.info(e) time.sleep(interval) - continue - return res - raise Exception(f"timed out while waiting for {func}") from last_exception + raise Exception(f"timed out while waiting for {name}") from last_exception def assert_eq(a, b) -> None: diff --git a/test_runner/logical_repl/test_clickhouse.py b/test_runner/logical_repl/test_clickhouse.py index 8e03bbe5d4f1..6b522fa46d22 100644 --- a/test_runner/logical_repl/test_clickhouse.py +++ b/test_runner/logical_repl/test_clickhouse.py @@ -60,24 +60,22 @@ def test_clickhouse(remote_pg: RemotePostgres): "SETTINGS materialized_postgresql_tables_list = 'table1';" ) wait_until( - 120, - 0.5, lambda: query_clickhouse( client, "select * from db1_postgres.table1 order by 1", "ee600d8f7cd05bd0b169fa81f44300a9dd10085a", ), + timeout=60, ) cur.execute("INSERT INTO table1 (id, column1) VALUES (3, 'ghi'), (4, 'jkl');") conn.commit() wait_until( - 120, - 0.5, lambda: query_clickhouse( client, "select * from db1_postgres.table1 order by 1", "9eba2daaf7e4d7d27ac849525f68b562ab53947d", ), + timeout=60, ) log.debug("Sleeping before final checking if Neon is still alive") time.sleep(3) diff --git a/test_runner/logical_repl/test_debezium.py b/test_runner/logical_repl/test_debezium.py index d2cb087c92f5..8023d64d3d73 100644 --- a/test_runner/logical_repl/test_debezium.py +++ b/test_runner/logical_repl/test_debezium.py @@ -148,14 +148,12 @@ def test_debezium(debezium): ) conn.commit() wait_until( - 100, - 0.5, lambda: get_kafka_msg( 
consumer, ts_ms, after={"first_name": "John", "last_name": "Dow", "email": "johndow@example.com"}, ), - show_intermediate_error=True, + timeout=60, ) ts_ms = time.time() * 1000 log.info("Insert 2 ts_ms: %s", ts_ms) @@ -165,28 +163,24 @@ def test_debezium(debezium): ) conn.commit() wait_until( - 100, - 0.5, lambda: get_kafka_msg( consumer, ts_ms, after={"first_name": "Alex", "last_name": "Row", "email": "alexrow@example.com"}, ), - show_intermediate_error=True, + timeout=60, ) ts_ms = time.time() * 1000 log.info("Update ts_ms: %s", ts_ms) cur.execute("update inventory.customers set first_name = 'Alexander' where id = 2") conn.commit() wait_until( - 100, - 0.5, lambda: get_kafka_msg( consumer, ts_ms, after={"first_name": "Alexander"}, ), - show_intermediate_error=True, + timeout=60, ) time.sleep(3) cur.execute("select 1") diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index c50c4ad4324e..3ce27d6cd3eb 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -137,7 +137,7 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: startup_line = "INFO version: git(-env)?:" # find the first line of the log file so we can find the next start later - _, first_start = wait_until(5, 1, lambda: env.pageserver.assert_log_contains(startup_line)) + _, first_start = wait_until(lambda: env.pageserver.assert_log_contains(startup_line)) # start without gc so we can time compaction with less noise; use shorter # period for compaction so it starts earlier @@ -156,7 +156,7 @@ def patch_default_tenant_config(config): ) _, second_start = wait_until( - 5, 1, lambda: env.pageserver.assert_log_contains(startup_line, first_start) + lambda: env.pageserver.assert_log_contains(startup_line, first_start), ) env.pageserver.quiesce_tenants() @@ -164,8 +164,6 @@ def patch_default_tenant_config(config): # wait for compaction to complete, which most likely has already done so multiple times msg, _ = wait_until( - 30, - 1, lambda: env.pageserver.assert_log_contains( f".*tenant_id={env.initial_tenant}.*: compaction iteration complete.*", second_start ), @@ -205,7 +203,7 @@ def metrics_are_filled() -> list[Sample]: assert len(matching) == len(expected_labels) return matching - samples = wait_until(10, 1, metrics_are_filled) + samples = wait_until(metrics_are_filled) for sample in samples: phase = sample.labels["phase"] diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 670c2698f5aa..45112fd67e2a 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -64,8 +64,6 @@ def negative_env(neon_env_builder: NeonEnvBuilder) -> Generator[NegativeTests, N ) wait_until( - 50, - 0.1, lambda: env.pageserver.assert_log_contains(".*Error processing HTTP request: Bad request"), ) diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index 302a8fd0d1b4..b6741aed68cb 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -385,7 +385,7 @@ def assert_broken(): # Wait for enough failures to break the circuit breaker # This wait is fairly long because we back off on compaction failures, so 5 retries takes ~30s - wait_until(60, 1, assert_broken) + wait_until(assert_broken, timeout=60) # Sleep for a while, during which time we expect that compaction will _not_ be retried time.sleep(10) diff 
--git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 18075110082b..05956b5b9378 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -211,7 +211,7 @@ def statvfs_called(): pageserver.assert_log_contains(".*running mocked statvfs.*") # we most likely have already completed multiple runs - wait_until(10, 1, statvfs_called) + wait_until(statvfs_called) def count_layers_per_tenant( @@ -772,14 +772,14 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv): ) wait_until( - 10, 1, lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved") + lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved") ) def less_than_max_usage_pct(): post_eviction_total_size, _, _ = env.timelines_du(env.pageserver) assert post_eviction_total_size < 0.33 * total_size, "we requested max 33% usage" - wait_until(2, 2, less_than_max_usage_pct) + wait_until(less_than_max_usage_pct, timeout=5) # Disk usage candidate collection only takes into account active tenants. # However, the statvfs call takes into account the entire tenants directory, @@ -825,7 +825,7 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv): ) wait_until( - 10, 1, lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved") + lambda: env.neon_env.pageserver.assert_log_contains(".*disk usage pressure relieved"), ) def more_than_min_avail_bytes_freed(): @@ -834,7 +834,7 @@ def more_than_min_avail_bytes_freed(): total_size - post_eviction_total_size >= min_avail_bytes ), f"we requested at least {min_avail_bytes} worth of free space" - wait_until(2, 2, more_than_min_avail_bytes_freed) + wait_until(more_than_min_avail_bytes_freed, timeout=5) def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv): diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py index 0b1ac11c1653..4044f25b37b8 100644 --- a/test_runner/regress/test_hot_standby.py +++ b/test_runner/regress/test_hot_standby.py @@ -257,7 +257,7 @@ def test_hot_standby_feedback(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): # Wait until we see that the pgbench_accounts is created + filled on replica *and* # index is created. Otherwise index creation would conflict with # read queries and hs feedback won't save us. - wait_until(60, 1.0, partial(pgbench_accounts_initialized, secondary)) + wait_until(partial(pgbench_accounts_initialized, secondary), timeout=60) # Test should fail if hs feedback is disabled anyway, but cross # check that walproposer sets some xmin. 
@@ -269,7 +269,7 @@ def xmin_is_not_null(): log.info(f"xmin is {slot_xmin}") assert int(slot_xmin) > 0 - wait_until(10, 1.0, xmin_is_not_null) + wait_until(xmin_is_not_null) for _ in range(1, 5): # in debug mode takes about 5-7s balance = secondary.safe_psql_scalar("select sum(abalance) from pgbench_accounts") @@ -286,7 +286,7 @@ def xmin_is_null(): log.info(f"xmin is {slot_xmin}") assert slot_xmin is None - wait_until(10, 1.0, xmin_is_null) + wait_until(xmin_is_null) # Test race condition between WAL replay and backends performing queries diff --git a/test_runner/regress/test_layers_from_future.py b/test_runner/regress/test_layers_from_future.py index 761ec7568f4b..8818b407122b 100644 --- a/test_runner/regress/test_layers_from_future.py +++ b/test_runner/regress/test_layers_from_future.py @@ -206,7 +206,7 @@ def future_layer_is_gone_from_index_part(): future_layers = set(get_future_layers()) assert future_layer not in future_layers - wait_until(10, 0.5, future_layer_is_gone_from_index_part) + wait_until(future_layer_is_gone_from_index_part) # We already make deletion stuck here, but we don't necessarily hit the failpoint # because deletions are batched. diff --git a/test_runner/regress/test_logging.py b/test_runner/regress/test_logging.py index f6fbdcabfd9d..d94c786f4983 100644 --- a/test_runner/regress/test_logging.py +++ b/test_runner/regress/test_logging.py @@ -37,7 +37,7 @@ def assert_logged(): return env.pageserver.assert_log_contains(f".*{msg_id}.*") - wait_until(10, 0.5, assert_logged) + wait_until(assert_logged) # make sure it's counted def assert_metric_value(): @@ -49,4 +49,4 @@ def assert_metric_value(): log.info("libmetrics_tracing_event_count: %s", val) assert val > (before or 0.0) - wait_until(10, 1, assert_metric_value) + wait_until(assert_metric_value) diff --git a/test_runner/regress/test_logical_replication.py b/test_runner/regress/test_logical_replication.py index ba471b7147d7..db18e1758c12 100644 --- a/test_runner/regress/test_logical_replication.py +++ b/test_runner/regress/test_logical_replication.py @@ -207,7 +207,7 @@ def slot_removed(ep: Endpoint): log.info(f"ep connstr is {endpoint.connstr()}, subscriber connstr {vanilla_pg.connstr()}") vanilla_pg.safe_psql(f"create subscription sub1 connection '{connstr}' publication pub1") - wait_until(number_of_iterations=10, interval=2, func=partial(slot_removed, endpoint)) + wait_until(partial(slot_removed, endpoint)) def test_ondemand_wal_download_in_replication_slot_funcs(neon_env_builder: NeonEnvBuilder): @@ -519,7 +519,7 @@ def check_that_changes_propagated(): assert len(res) == 4 assert [r[0] for r in res] == [10, 20, 30, 40] - wait_until(10, 0.5, check_that_changes_propagated) + wait_until(check_that_changes_propagated) def logical_replication_wait_flush_lsn_sync(publisher: PgProtocol) -> Lsn: @@ -549,7 +549,7 @@ def check_caughtup(): ) assert flush_lsn >= publisher_flush_lsn - wait_until(30, 0.5, check_caughtup) + wait_until(check_caughtup) return publisher_flush_lsn diff --git a/test_runner/regress/test_lsn_mapping.py b/test_runner/regress/test_lsn_mapping.py index 7f0b54112820..e42e71646d97 100644 --- a/test_runner/regress/test_lsn_mapping.py +++ b/test_runner/regress/test_lsn_mapping.py @@ -169,7 +169,7 @@ def test_get_lsn_by_timestamp_cancelled(neon_env_builder: NeonEnvBuilder): ) _, offset = wait_until( - 20, 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") ) with pytest.raises(ReadTimeout): @@ -178,8 +178,6 @@ 
def test_get_lsn_by_timestamp_cancelled(neon_env_builder: NeonEnvBuilder): client.configure_failpoints((failpoint, "off")) _, offset = wait_until( - 20, - 0.5, lambda: env.pageserver.assert_log_contains( "Cancelled request finished with an error: Cancelled$", offset ), diff --git a/test_runner/regress/test_neon_superuser.py b/test_runner/regress/test_neon_superuser.py index 7118127a1ffe..49cd91906f8f 100644 --- a/test_runner/regress/test_neon_superuser.py +++ b/test_runner/regress/test_neon_superuser.py @@ -77,7 +77,7 @@ def check_that_changes_propagated(): assert len(res) == 4 assert [r[0] for r in res] == [10, 20, 30, 40] - wait_until(10, 0.5, check_that_changes_propagated) + wait_until(check_that_changes_propagated) # Test that pg_monitor is working for neon_superuser role cur.execute("SELECT query from pg_stat_activity LIMIT 1") diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index e1caaeb6c1f6..028d1c2e49b8 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -256,7 +256,7 @@ def get_resident_physical_size(): ##### Second start, restore the data and ensure it's the same env.pageserver.start() - wait_until(10, 0.2, lambda: assert_tenant_state(client, tenant_id, "Active")) + wait_until(lambda: assert_tenant_state(client, tenant_id, "Active")) # The current_physical_size reports the sum of layers loaded in the layer # map, regardless of where the layer files are located. So even though we @@ -413,7 +413,7 @@ def get_resident_physical_size(): ] ) - wait_until(10, 0.2, lambda: assert_tenant_state(client, tenant_id, "Active")) + wait_until(lambda: assert_tenant_state(client, tenant_id, "Active")) ###### Phase 1: exercise download error code path @@ -705,7 +705,7 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu ) _, offset = wait_until( - 20, 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") ) location_conf = {"mode": "Detached", "tenant_conf": {}} @@ -713,8 +713,6 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu detach = exec.submit(client.tenant_location_conf, env.initial_tenant, location_conf) _, offset = wait_until( - 20, - 0.5, lambda: env.pageserver.assert_log_contains( "closing is taking longer than expected", offset ), @@ -734,8 +732,6 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu client.configure_failpoints((failpoint, "pause")) _, offset = wait_until( - 20, - 0.5, lambda: env.pageserver.assert_log_contains(f"cfg failpoint: {failpoint} pause", offset), ) @@ -750,8 +746,6 @@ def test_layer_download_cancelled_by_config_location(neon_env_builder: NeonEnvBu warmup = exec.submit(client.tenant_secondary_download, env.initial_tenant, wait_ms=30000) _, offset = wait_until( - 20, - 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}", offset), ) @@ -805,7 +799,7 @@ def test_layer_download_timeouted(neon_env_builder: NeonEnvBuilder): ) _, offset = wait_until( - 20, 0.5, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") ) # ensure enough time while paused to trip the timeout time.sleep(2) @@ -824,8 +818,6 @@ def test_layer_download_timeouted(neon_env_builder: NeonEnvBuilder): # capture the next offset for a new synchronization with the failpoint _, 
offset = wait_until( - 20, - 0.5, lambda: env.pageserver.assert_log_contains(f"cfg failpoint: {failpoint} pause", offset), ) diff --git a/test_runner/regress/test_pageserver_api.py b/test_runner/regress/test_pageserver_api.py index 05e81b82e07a..55fd7a8608b4 100644 --- a/test_runner/regress/test_pageserver_api.py +++ b/test_runner/regress/test_pageserver_api.py @@ -117,19 +117,11 @@ def test_pageserver_http_get_wal_receiver_success(neon_simple_env: NeonEnv): # We need to wait here because it's possible that we don't have access to # the latest WAL yet, when the `timeline_detail` API is first called. # See: https://github.com/neondatabase/neon/issues/1768. - lsn = wait_until( - number_of_iterations=5, - interval=1, - func=lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, None), - ) + lsn = wait_until(lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, None)) # Make a DB modification then expect getting a new WAL receiver's data. endpoint.safe_psql("INSERT INTO t VALUES (1, 'hey')") - wait_until( - number_of_iterations=5, - interval=1, - func=lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, lsn), - ) + wait_until(lambda: expect_updated_msg_lsn(client, tenant_id, timeline_id, lsn)) def test_pageserver_http_api_client(neon_simple_env: NeonEnv): diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index 6ba5753420c7..7e5bb45242ff 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ b/test_runner/regress/test_pageserver_generations.py @@ -352,7 +352,7 @@ def test_deletion_queue_recovery( def assert_some_validations(): assert get_deletion_queue_validated(ps_http) > 0 - wait_until(20, 1, assert_some_validations) + wait_until(assert_some_validations) # The validatated keys statistic advances before the header is written, so we # also wait to see the header hit the disk: this seems paranoid but the race @@ -360,7 +360,7 @@ def assert_some_validations(): def assert_header_written(): assert (main_pageserver.workdir / "deletion" / "header-01").exists() - wait_until(20, 1, assert_header_written) + wait_until(assert_header_written) # If we will lose attachment, then our expectation on restart is that only the ones # we already validated will execute. Act like only those were present in the queue. @@ -382,11 +382,11 @@ def assert_deletions_submitted(n: int) -> None: # After restart, issue a flush to kick the deletion frontend to do recovery. # It should recover all the operations we submitted before the restart. 
ps_http.deletion_queue_flush(execute=False) - wait_until(20, 0.25, lambda: assert_deletions_submitted(before_restart_depth)) + wait_until(lambda: assert_deletions_submitted(before_restart_depth)) # The queue should drain through completely if we flush it ps_http.deletion_queue_flush(execute=True) - wait_until(10, 1, lambda: assert_deletion_queue(ps_http, lambda n: n == 0)) + wait_until(lambda: assert_deletion_queue(ps_http, lambda n: n == 0)) if keep_attachment == KeepAttachment.KEEP: # - If we kept the attachment, then our pre-restart deletions should execute @@ -564,7 +564,7 @@ def test_multi_attach( ) # Initially, the tenant will be attached to the first pageserver (first is default in our test harness) - wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[0], tenant_id, "Active")) + wait_until(lambda: assert_tenant_state(http_clients[0], tenant_id, "Active")) _detail = http_clients[0].timeline_detail(tenant_id, timeline_id) with pytest.raises(PageserverApiException): http_clients[1].timeline_detail(tenant_id, timeline_id) @@ -579,8 +579,8 @@ def test_multi_attach( pageservers[1].tenant_attach(env.initial_tenant) pageservers[2].tenant_attach(env.initial_tenant) - wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[1], tenant_id, "Active")) - wait_until(10, 0.2, lambda: assert_tenant_state(http_clients[2], tenant_id, "Active")) + wait_until(lambda: assert_tenant_state(http_clients[1], tenant_id, "Active")) + wait_until(lambda: assert_tenant_state(http_clients[2], tenant_id, "Active")) # Now they all have it attached _details = list([c.timeline_detail(tenant_id, timeline_id) for c in http_clients]) diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py index f1aad85fe98a..ba6a1d90451e 100644 --- a/test_runner/regress/test_pageserver_getpage_throttle.py +++ b/test_runner/regress/test_pageserver_getpage_throttle.py @@ -81,9 +81,7 @@ def run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs: i marker = uuid.uuid4().hex ps_http.post_tracing_event("info", marker) - _, marker_offset = wait_until( - 10, 0.5, lambda: env.pageserver.assert_log_contains(marker, offset=None) - ) + _, marker_offset = wait_until(lambda: env.pageserver.assert_log_contains(marker, offset=None)) log.info("run pagebench") duration_secs = 10 @@ -103,12 +101,11 @@ def run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs: i log.info("validate that we logged the throttling") wait_until( - 10, - compaction_period / 10, lambda: env.pageserver.assert_log_contains( f".*{tenant_id}.*shard was throttled in the last n_seconds.*", offset=marker_offset, ), + timeout=compaction_period, ) log.info("validate that the metric doesn't include throttle wait time") diff --git a/test_runner/regress/test_pageserver_layer_rolling.py b/test_runner/regress/test_pageserver_layer_rolling.py index f6a7bfa1ade5..706da1e35e00 100644 --- a/test_runner/regress/test_pageserver_layer_rolling.py +++ b/test_runner/regress/test_pageserver_layer_rolling.py @@ -84,7 +84,7 @@ def query(): # The metric gets initialised on the first update. # Retry a few times, but return 0 if it's stable. try: - return float(wait_until(3, 0.5, query)) + return float(wait_until(query, timeout=2, interval=0.5)) except Exception: return 0 @@ -131,7 +131,7 @@ def test_pageserver_small_inmemory_layers( wait_until_pageserver_is_caught_up(env, last_flush_lsns) # We didn't write enough data to trigger a size-based checkpoint: we should see dirty data. 
- wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) + wait_until(lambda: assert_dirty_bytes_nonzero(env)) ps_http_client = env.pageserver.http_client() total_wal_ingested_before_restart = wait_for_wal_ingest_metric(ps_http_client) @@ -139,7 +139,7 @@ def test_pageserver_small_inmemory_layers( # Within ~ the checkpoint interval, all the ephemeral layers should be frozen and flushed, # such that there are zero bytes of ephemeral layer left on the pageserver log.info("Waiting for background checkpoints...") - wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) + wait_until(lambda: assert_dirty_bytes(env, 0), timeout=2 * CHECKPOINT_TIMEOUT_SECONDS) # Zero ephemeral layer bytes does not imply that all the frozen layers were uploaded: they # must be uploaded to remain visible to the pageserver after restart. @@ -180,7 +180,7 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder): wait_until_pageserver_is_caught_up(env, last_flush_lsns) # We didn't write enough data to trigger a size-based checkpoint: we should see dirty data. - wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) + wait_until(lambda: assert_dirty_bytes_nonzero(env)) # Stop the safekeepers, so that we cannot have any more WAL receiver connections for sk in env.safekeepers: @@ -193,7 +193,7 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder): # Within ~ the checkpoint interval, all the ephemeral layers should be frozen and flushed, # such that there are zero bytes of ephemeral layer left on the pageserver log.info("Waiting for background checkpoints...") - wait_until(CHECKPOINT_TIMEOUT_SECONDS * 2, 1, lambda: assert_dirty_bytes(env, 0)) + wait_until(lambda: assert_dirty_bytes(env, 0), timeout=2 * CHECKPOINT_TIMEOUT_SECONDS) # The code below verifies that we do not flush on the first write # after an idle period longer than the checkpoint timeout. @@ -210,7 +210,7 @@ def test_idle_checkpoints(neon_env_builder: NeonEnvBuilder): run_worker_for_tenant(env, 5, tenant_with_extra_writes, offset=ENTRIES_PER_TIMELINE) ) - dirty_after_write = wait_until(10, 1, lambda: assert_dirty_bytes_nonzero(env)) + dirty_after_write = wait_until(lambda: assert_dirty_bytes_nonzero(env)) # We shouldn't flush since we've just opened a new layer waited_for = 0 @@ -305,11 +305,11 @@ def assert_bytes_rolled(): # Wait until enough layers have rolled that the amount of dirty data is under the threshold. # We do this indirectly via layer maps, rather than the dirty bytes metric, to avoid false-passing # if that metric isn't updated quickly enough to reflect the dirty bytes exceeding the limit. 
- wait_until(compaction_period_s * 2, 1, assert_bytes_rolled) + wait_until(assert_bytes_rolled, timeout=2 * compaction_period_s) # The end state should also have the reported metric under the limit def assert_dirty_data_limited(): dirty_bytes = get_dirty_bytes(env) assert dirty_bytes < max_dirty_data - wait_until(compaction_period_s * 2, 1, lambda: assert_dirty_data_limited()) + wait_until(lambda: assert_dirty_data_limited(), timeout=2 * compaction_period_s) diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index 4bf570551731..835ccbd5d430 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -103,7 +103,7 @@ def assert_complete(): raise AssertionError("No 'complete' metric yet") - wait_until(30, 1.0, assert_complete) + wait_until(assert_complete) # Expectation callbacks: arg t is sample value, arg p is the previous phase's sample value expectations = [ diff --git a/test_runner/regress/test_pageserver_secondary.py b/test_runner/regress/test_pageserver_secondary.py index a264f4d3c9c2..1292682f9e3d 100644 --- a/test_runner/regress/test_pageserver_secondary.py +++ b/test_runner/regress/test_pageserver_secondary.py @@ -356,7 +356,7 @@ def caught_up(): ) assert destination_lsn >= origin_lsn - wait_until(100, 0.1, caught_up) + wait_until(caught_up) # The destination should accept writes workload.churn_rows(64, pageserver_b.id) @@ -411,7 +411,7 @@ def blocked_deletions_drained(): assert submitted is not None assert submitted > 0 - wait_until(10, 0.1, blocked_deletions_drained) + wait_until(blocked_deletions_drained) workload.churn_rows(64, pageserver_b.id) workload.validate(pageserver_b.id) @@ -702,7 +702,7 @@ def await_log(pageserver, deadline, expression): else: timeout = int(deadline - now) + 1 try: - wait_until(timeout, 1, lambda: pageserver.assert_log_contains(expression)) + wait_until(lambda: pageserver.assert_log_contains(expression), timeout=timeout) except: log.error(f"Timed out waiting for '{expression}'") raise diff --git a/test_runner/regress/test_readonly_node.py b/test_runner/regress/test_readonly_node.py index 70d558ac5af0..c13bea7ee178 100644 --- a/test_runner/regress/test_readonly_node.py +++ b/test_runner/regress/test_readonly_node.py @@ -215,8 +215,6 @@ def trigger_gc_and_select( # wait for lease renewal before running query. 
_, offset = wait_until( - 20, - 0.5, lambda: ep_static.assert_log_contains( "lsn_lease_bg_task.*Request succeeded", offset=offset ), diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 137e75f78446..76a42ef4a2a2 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -300,9 +300,9 @@ def get_queued_count(file_kind, op_kind): print_gc_result(gc_result) assert gc_result["layers_removed"] > 0 - wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0)) - wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0)) - wait_until(2, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0)) + wait_until(lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0)) + wait_until(lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0)) + wait_until(lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0)) # let all future operations queue up configure_storage_sync_failpoints("return") @@ -333,16 +333,28 @@ def churn_while_failpoints_active(result): # wait for churn thread's data to get stuck in the upload queue # Exponential back-off in upload queue, so, gracious timeouts. - wait_until(30, 1, lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0)) - wait_until(30, 1, lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 1)) - wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0)) + wait_until( + lambda: assert_gt(get_queued_count(file_kind="layer", op_kind="upload"), 0), timeout=30 + ) + wait_until( + lambda: assert_ge(get_queued_count(file_kind="index", op_kind="upload"), 1), timeout=30 + ) + wait_until( + lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0), timeout=30 + ) # unblock churn operations configure_storage_sync_failpoints("off") - wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0)) - wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0)) - wait_until(30, 1, lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0)) + wait_until( + lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="upload"), 0), timeout=30 + ) + wait_until( + lambda: assert_eq(get_queued_count(file_kind="index", op_kind="upload"), 0), timeout=30 + ) + wait_until( + lambda: assert_eq(get_queued_count(file_kind="layer", op_kind="delete"), 0), timeout=30 + ) # The churn thread doesn't make progress once it blocks on the first wait_completion() call, # so, give it some time to wrap up. @@ -580,7 +592,7 @@ def assert_compacted_and_uploads_queued(): > 0 ) - wait_until(200, 0.1, assert_compacted_and_uploads_queued) + wait_until(assert_compacted_and_uploads_queued) # Regardless, give checkpoint some time to block for good. # Not strictly necessary, but might help uncover failure modes in the future. @@ -598,9 +610,7 @@ def assert_compacted_and_uploads_queued(): ] ) - # Generous timeout, because currently deletions can get blocked waiting for compaction - # This can be reduced when https://github.com/neondatabase/neon/issues/4998 is fixed. 
- timeline_delete_wait_completed(client, tenant_id, timeline_id, iterations=30, interval=1) + timeline_delete_wait_completed(client, tenant_id, timeline_id) assert not timeline_path.exists() @@ -826,22 +836,16 @@ def wait_upload_queue_empty( client: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId ): wait_until( - 2, - 1, lambda: assert_eq( get_queued_count(client, tenant_id, timeline_id, file_kind="layer", op_kind="upload"), 0 ), ) wait_until( - 2, - 1, lambda: assert_eq( get_queued_count(client, tenant_id, timeline_id, file_kind="index", op_kind="upload"), 0 ), ) wait_until( - 2, - 1, lambda: assert_eq( get_queued_count(client, tenant_id, timeline_id, file_kind="layer", op_kind="delete"), 0 ), diff --git a/test_runner/regress/test_replica_start.py b/test_runner/regress/test_replica_start.py index 8e7c01f95029..e2a22cc769f7 100644 --- a/test_runner/regress/test_replica_start.py +++ b/test_runner/regress/test_replica_start.py @@ -378,7 +378,7 @@ def check_replica_crashed(): return None raise RuntimeError("connection succeeded") - wait_until(20, 0.5, check_replica_crashed) + wait_until(check_replica_crashed) assert secondary.log_contains("too many KnownAssignedXids") # Replica is crashed, so ignore stop result diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index 411574bd8621..c86ba0d4ea65 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -836,7 +836,7 @@ def assert_restart_notification(): assert len(notifications) == 3 assert notifications[2] == expect_after - wait_until(10, 1, assert_restart_notification) + wait_until(assert_restart_notification) # The quantity of data isn't huge, but debug can be _very_ slow, and the things we're @@ -1025,7 +1025,7 @@ def assert_all_disk_consistent(): assert Lsn(timeline_detail["disk_consistent_lsn"]) >= expect_lsn # We set a short checkpoint timeout: expect things to get frozen+flushed within that - wait_until(checkpoint_interval_secs * 3, 1, assert_all_disk_consistent) + wait_until(assert_all_disk_consistent, timeout=3 * checkpoint_interval_secs) def assert_all_remote_consistent(): """ @@ -1037,7 +1037,7 @@ def assert_all_remote_consistent(): assert Lsn(timeline_detail["remote_consistent_lsn"]) >= expect_lsn # We set a short checkpoint timeout: expect things to get frozen+flushed within that - wait_until(checkpoint_interval_secs * 3, 1, assert_all_remote_consistent) + wait_until(assert_all_remote_consistent, timeout=3 * checkpoint_interval_secs) workload.validate() @@ -1405,14 +1405,14 @@ def finish_split(): # e.g. 
while waiting for a storage controller to re-attach a parent shard if we failed # inside the pageserver and the storage controller responds by detaching children and attaching # parents concurrently (https://github.com/neondatabase/neon/issues/7148) - wait_until(10, 1, lambda: workload.churn_rows(10, upload=False, ingest=False)) + wait_until(lambda: workload.churn_rows(10, upload=False, ingest=False)) workload.validate() if failure.fails_forward(env): log.info("Fail-forward failure, checking split eventually completes...") # A failure type which results in eventual completion of the split - wait_until(30, 1, assert_split_done) + wait_until(assert_split_done) elif failure.can_mitigate(): log.info("Mitigating failure...") # Mitigation phase: we expect to be able to proceed with a successful shard split @@ -1420,21 +1420,21 @@ def finish_split(): # The split should appear to be rolled back from the point of view of all pageservers # apart from the one that is offline - wait_until(30, 1, lambda: assert_rolled_back(exclude_ps_id=failure.pageserver_id)) + wait_until(lambda: assert_rolled_back(exclude_ps_id=failure.pageserver_id)) finish_split() - wait_until(30, 1, lambda: assert_split_done(exclude_ps_id=failure.pageserver_id)) + wait_until(lambda: assert_split_done(exclude_ps_id=failure.pageserver_id)) # Having cleared the failure, everything should converge to a pristine state failure.clear(env) - wait_until(30, 1, assert_split_done) + wait_until(assert_split_done) else: # Once we restore the faulty pageserver's API to good health, rollback should # eventually complete. log.info("Clearing failure...") failure.clear(env) - wait_until(30, 1, assert_rolled_back) + wait_until(assert_rolled_back) # Having rolled back, the tenant should be working workload.churn_rows(10) diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 13bc54a1146d..e93e251b4fa7 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -154,7 +154,7 @@ def node_evacuated(node_id: int) -> None: counts = get_node_shard_counts(env, tenant_ids) assert counts[node_id] == 0 - wait_until(10, 1, lambda: node_evacuated(env.pageservers[0].id)) + wait_until(lambda: node_evacuated(env.pageservers[0].id)) # Let all the reconciliations after marking the node offline complete env.storage_controller.reconcile_until_idle() @@ -222,7 +222,7 @@ def test_node_status_after_restart( def is_ready(): assert env.storage_controller.ready() is True - wait_until(30, 1, is_ready) + wait_until(is_ready) # We loaded nodes from database on restart nodes = env.storage_controller.node_list() @@ -606,7 +606,7 @@ def node_evacuated(node_id: int) -> None: counts = get_node_shard_counts(env, [env.initial_tenant]) assert counts[node_id] == 0 - wait_until(10, 1, lambda: node_evacuated(env.pageservers[0].id)) + wait_until(lambda: node_evacuated(env.pageservers[0].id)) # Additional notification from migration log.info(f"notifications: {notifications}") @@ -620,7 +620,7 @@ def received_migration_notification(): assert len(notifications) == 2 assert notifications[1] == expect - wait_until(20, 0.25, received_migration_notification) + wait_until(received_migration_notification) # When we restart, we should re-emit notifications for all tenants env.storage_controller.stop() @@ -630,7 +630,7 @@ def received_restart_notification(): assert len(notifications) == 3 assert notifications[2] == expect - wait_until(10, 1, received_restart_notification) + 
wait_until(received_restart_notification) # Splitting a tenant should cause its stripe size to become visible in the compute notification env.storage_controller.tenant_shard_split(env.initial_tenant, shard_count=2) @@ -647,7 +647,7 @@ def received_split_notification(): assert len(notifications) == 4 assert notifications[3] == expect - wait_until(10, 1, received_split_notification) + wait_until(received_split_notification) # If the compute hook is unavailable, that should not block creating a tenant and # creating a timeline. This simulates a control plane refusing to accept notifications @@ -736,7 +736,7 @@ def handler(request: Request): def logged_stuck(): env.storage_controller.assert_log_contains(NOTIFY_BLOCKED_LOG) - wait_until(10, 0.25, logged_stuck) + wait_until(logged_stuck) contains_r = env.storage_controller.log_contains(NOTIFY_BLOCKED_LOG) assert contains_r is not None # Appease mypy (_, log_cursor) = contains_r @@ -764,7 +764,7 @@ def logged_stuck(): def logged_stuck_again(): env.storage_controller.assert_log_contains(NOTIFY_BLOCKED_LOG, offset=log_cursor) - wait_until(10, 0.25, logged_stuck_again) + wait_until(logged_stuck_again) assert migrate_fut.running() # This time, the compute hook remains stuck, but we mark the origin node offline: this should @@ -865,7 +865,7 @@ def notified_ps(ps_id: int) -> None: assert latest["shards"] is not None assert latest["shards"][0]["node_id"] == ps_id - wait_until(30, 1, lambda: notified_ps(pageserver_a.id)) + wait_until(lambda: notified_ps(pageserver_a.id)) env.storage_controller.allowed_errors.append(NOTIFY_BLOCKED_LOG) env.storage_controller.allowed_errors.extend(NOTIFY_FAILURE_LOGS) @@ -880,7 +880,7 @@ def notified_ps(ps_id: int) -> None: # Although the migration API failed, the hook should still see pageserver B (it remembers what # was posted even when returning an error code) - wait_until(30, 1, lambda: notified_ps(pageserver_b.id)) + wait_until(lambda: notified_ps(pageserver_b.id)) # Although the migration API failed, the tenant should still have moved to the right pageserver assert len(pageserver_b.http_client().tenant_list()) == 1 @@ -898,7 +898,7 @@ def notified_ps(ps_id: int) -> None: def logged_giving_up(): env.storage_controller.assert_log_contains(".*Giving up on compute notification.*") - wait_until(30, 1, logged_giving_up) + wait_until(logged_giving_up) pageserver_a.start() @@ -919,7 +919,7 @@ def logged_giving_up(): handle_params["status"] = 200 env.storage_controller.tenant_shard_migrate(tenant_shard_id, pageserver_a.id) - wait_until(30, 1, lambda: notified_ps(pageserver_a.id)) + wait_until(lambda: notified_ps(pageserver_a.id)) def test_storage_controller_debug_apis(neon_env_builder: NeonEnvBuilder): @@ -1453,7 +1453,7 @@ def tenants_placed(): # Check that each node got one tenant assert all(len(ts) == 1 for ts in node_to_tenants.values()) - wait_until(10, 1, tenants_placed) + wait_until(tenants_placed) # ... then we apply the failure offline_node_ids = set(failure.nodes()) @@ -1476,7 +1476,7 @@ def nodes_offline(): assert node["availability"] == "Offline" start = time.time() - wait_until(failure.offline_timeout, 1, nodes_offline) + wait_until(nodes_offline, timeout=failure.offline_timeout) detected_after = time.time() - start log.info(f"Detected node failures after {detected_after}s") @@ -1497,7 +1497,7 @@ def tenant_migrated(): assert observed_tenants == set(tenant_ids) - wait_until(10, 1, tenant_migrated) + wait_until(tenant_migrated) # ... 
then we clear the failure failure.clear(env) @@ -1509,7 +1509,7 @@ def nodes_online(): if node["id"] in online_node_ids: assert node["availability"] == "Active" - wait_until(10, 1, nodes_online) + wait_until(nodes_online) time.sleep(5) @@ -1562,7 +1562,7 @@ def failed_over(): # We could pre-empty this by configuring the node to Offline, but it's preferable to test # the realistic path we would take when a node restarts uncleanly. # The delay here will be ~NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL in neon_local - wait_until(30, 1, failed_over) + wait_until(failed_over) reconciles_before_restart = env.storage_controller.get_metric_value( "storage_controller_reconcile_complete_total", filter={"status": "ok"} @@ -1640,12 +1640,12 @@ def assert_errors_gt(n) -> int: assert e > n return e - errs = wait_until(10, 1, lambda: assert_errors_gt(0)) + errs = wait_until(lambda: assert_errors_gt(0)) # Try reconciling again, it should fail again with pytest.raises(StorageControllerApiException): env.storage_controller.reconcile_all() - errs = wait_until(10, 1, lambda: assert_errors_gt(errs)) + errs = wait_until(lambda: assert_errors_gt(errs)) # Configure the tenant to disable reconciles env.storage_controller.tenant_policy_update( @@ -1674,7 +1674,7 @@ def assert_ok_gt(n) -> int: return o # We should see a successful reconciliation - wait_until(10, 1, lambda: assert_ok_gt(0)) + wait_until(lambda: assert_ok_gt(0)) # And indeed the tenant should be attached assert len(env.pageserver.http_client().tenant_list_locations()["tenant_shards"]) == 1 @@ -2073,7 +2073,7 @@ def secondary_is_lagging(): raise Exception(f"Secondary lag not big enough: {lag}") log.info(f"Looking for lag to develop on the secondary {secondary}") - wait_until(10, 1, secondary_is_lagging) + wait_until(secondary_is_lagging) log.info(f"Starting drain of primary {primary} with laggy secondary {secondary}") env.storage_controller.retryable_node_operation( @@ -2107,7 +2107,7 @@ def lag_is_acceptable(): if lag > 1 * 1024 * 1024: raise Exception(f"Secondary lag not big enough: {lag}") - wait_until(10, 1, lag_is_acceptable) + wait_until(lag_is_acceptable) env.storage_controller.node_configure(primary, {"scheduling": "Active"}) @@ -2227,7 +2227,7 @@ def assert_shards_migrated(): log.info(f"Shards on nodes other than on victim: {elsewhere}") assert elsewhere == tenant_count * shard_count_per_tenant - wait_until(30, 1, assert_shards_migrated) + wait_until(assert_shards_migrated) log.info(f"Deleting pageserver {victim.id}") env.storage_controller.node_delete(victim.id) @@ -2240,7 +2240,7 @@ def assert_victim_evacuated(): log.info(f"Shards on node {victim.id}: {count}") assert count == 0 - wait_until(30, 1, assert_victim_evacuated) + wait_until(assert_victim_evacuated) # The node should be gone from the list API assert victim.id not in [n["id"] for n in env.storage_controller.node_list()] @@ -2569,7 +2569,7 @@ def previous_stepped_down(): == StorageControllerLeadershipStatus.STEPPED_DOWN ) - wait_until(5, 1, previous_stepped_down) + wait_until(previous_stepped_down) storage_controller_proxy.route_to(f"http://127.0.0.1:{storage_controller_2_port}") @@ -2579,7 +2579,7 @@ def new_becomes_leader(): == StorageControllerLeadershipStatus.LEADER ) - wait_until(15, 1, new_becomes_leader) + wait_until(new_becomes_leader) leader = env.storage_controller.get_leader() assert leader["address"] == f"http://127.0.0.1:{storage_controller_2_port}/" @@ -2624,7 +2624,7 @@ def attached_is_draining(): env.storage_controller.configure_failpoints(("sleepy-drain-loop", 
"return(10000)")) env.storage_controller.node_drain(attached.id) - wait_until(10, 0.5, attached_is_draining) + wait_until(attached_is_draining) attached.restart() @@ -2646,7 +2646,7 @@ def reconfigure_node_again(): env.storage_controller.node_configure(attached.id, {"scheduling": "Pause"}) # allow for small delay between actually having cancelled and being able reconfigure again - wait_until(4, 0.5, reconfigure_node_again) + wait_until(reconfigure_node_again) def test_storage_controller_timeline_crud_race(neon_env_builder: NeonEnvBuilder): @@ -2691,7 +2691,7 @@ def has_hit_failpoint(): ps.log_contains(f"at failpoint {failpoint}") is not None for ps in env.pageservers ) - wait_until(10, 1, has_hit_failpoint) + wait_until(has_hit_failpoint) # Migrate the tenant while the timeline creation is in progress: this migration will complete once it # can detach from the old pageserver, which will happen once the failpoint completes. @@ -2775,7 +2775,7 @@ def test_storage_controller_validate_during_migration(neon_env_builder: NeonEnvB def has_hit_compaction_failpoint(): assert origin_pageserver.log_contains(f"at failpoint {compaction_failpoint}") - wait_until(10, 1, has_hit_compaction_failpoint) + wait_until(has_hit_compaction_failpoint) # While the compaction is running, start a live migration which will pause long enough for the compaction to sleep, # after incrementing generation and attaching the new location @@ -2794,7 +2794,7 @@ def has_hit_migration_failpoint(): # before it reaches this point. The timeout is because the AttachedStale transition includes # a flush of remote storage, and if the compaction already enqueued an index upload this cannot # make progress. - wait_until(60, 1, has_hit_migration_failpoint) + wait_until(has_hit_migration_failpoint, timeout=60) # Origin pageserver has succeeded with compaction before the migration completed. 
It has done all the writes it wanted to do in its own (stale) generation origin_pageserver.http_client().configure_failpoints((compaction_failpoint, "off")) @@ -2917,7 +2917,7 @@ def has_hit_migration_failpoint(): log.info(expr) assert env.storage_controller.log_contains(expr) - wait_until(10, 1, has_hit_migration_failpoint) + wait_until(has_hit_migration_failpoint) # This request should be routed to whichever pageserver holds the highest generation tenant_info = env.storage_controller.pageserver_api().tenant_status( @@ -2934,7 +2934,7 @@ def has_hit_migration_failpoint(): # We expect request to land on the origin assert tenant_info["generation"] == 1 - wait_until(10, 1, long_migration_metric_published) + wait_until(long_migration_metric_published) # Eventually migration completes env.storage_controller.configure_failpoints((migration_failpoint.value, "off")) @@ -3113,7 +3113,7 @@ def has_hit_migration_failpoint(): log.info(expr) assert env.storage_controller.log_contains(expr) - wait_until(10, 1, has_hit_migration_failpoint) + wait_until(has_hit_migration_failpoint) env.storage_controller.pageserver_api().timeline_delete( tenant_id=tenant_id, timeline_id=timeline_id @@ -3182,7 +3182,7 @@ def has_hit_migration_failpoint(): log.info(expr) assert env.storage_controller.log_contains(expr) - wait_until(10, 1, has_hit_migration_failpoint) + wait_until(has_hit_migration_failpoint) timeline_id = TimelineId.generate() env.storage_controller.pageserver_api().timeline_create( diff --git a/test_runner/regress/test_storage_scrubber.py b/test_runner/regress/test_storage_scrubber.py index 3991bd7061b9..b16dc54c248b 100644 --- a/test_runner/regress/test_storage_scrubber.py +++ b/test_runner/regress/test_storage_scrubber.py @@ -431,8 +431,6 @@ def stuck_split(): # Let the controller reach the failpoint wait_until( - 10, - 1, lambda: env.storage_controller.assert_log_contains( 'failpoint "shard-split-post-remote-sleep": sleeping' ), diff --git a/test_runner/regress/test_subscriber_restart.py b/test_runner/regress/test_subscriber_restart.py index d37eeb1e6ebb..7d4f66d04448 100644 --- a/test_runner/regress/test_subscriber_restart.py +++ b/test_runner/regress/test_subscriber_restart.py @@ -56,4 +56,4 @@ def insert_data(pub): pcur.execute(f"INSERT into t values ({n_records}, 0)") n_records += 1 with sub.cursor() as scur: - wait_until(60, 0.5, check_that_changes_propagated) + wait_until(check_that_changes_propagated) diff --git a/test_runner/regress/test_tenant_conf.py b/test_runner/regress/test_tenant_conf.py index 1dd46ec3d111..f8f240cfdcc4 100644 --- a/test_runner/regress/test_tenant_conf.py +++ b/test_runner/regress/test_tenant_conf.py @@ -234,11 +234,7 @@ def test_creating_tenant_conf_after_attach(neon_env_builder: NeonEnvBuilder): assert not config_path.exists(), "detach did not remove config file" env.pageserver.tenant_attach(tenant_id) - wait_until( - number_of_iterations=5, - interval=1, - func=lambda: assert_tenant_state(http_client, tenant_id, "Active"), - ) + wait_until(lambda: assert_tenant_state(http_client, tenant_id, "Active")) env.config_tenant(tenant_id, {"gc_horizon": "1000000"}) contents_first = config_path.read_text() diff --git a/test_runner/regress/test_tenant_delete.py b/test_runner/regress/test_tenant_delete.py index 47df3ead7020..48e55c1ab15b 100644 --- a/test_runner/regress/test_tenant_delete.py +++ b/test_runner/regress/test_tenant_delete.py @@ -185,21 +185,21 @@ def tenant_is_deleted(): deletion = None try: - wait_until(10, 1, has_hit_failpoint) + wait_until(has_hit_failpoint) # it 
should start ok, sync up with the stuck creation, then hang waiting for the timeline # to shut down. deletion = Thread(target=start_deletion) deletion.start() - wait_until(10, 1, deletion_has_started_waiting_for_timelines) + wait_until(deletion_has_started_waiting_for_timelines) pageserver_http.configure_failpoints((failpoint, "off")) creation.join() deletion.join() - wait_until(10, 1, tenant_is_deleted) + wait_until(tenant_is_deleted) finally: creation.join() if deletion is not None: @@ -264,7 +264,7 @@ def timeline_create(): def hit_initdb_upload_failpoint(): env.pageserver.assert_log_contains(f"at failpoint {BEFORE_INITDB_UPLOAD_FAILPOINT}") - wait_until(100, 0.1, hit_initdb_upload_failpoint) + wait_until(hit_initdb_upload_failpoint) def creation_connection_timed_out(): env.pageserver.assert_log_contains( @@ -273,7 +273,7 @@ def creation_connection_timed_out(): # Wait so that we hit the timeout and the connection is dropped # (But timeline creation still continues) - wait_until(100, 0.1, creation_connection_timed_out) + wait_until(creation_connection_timed_out) ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "pause")) @@ -281,7 +281,7 @@ def tenant_delete(): def tenant_delete_inner(): ps_http.tenant_delete(tenant_id) - wait_until(100, 0.5, tenant_delete_inner) + wait_until(tenant_delete_inner) Thread(target=tenant_delete).start() @@ -290,7 +290,7 @@ def deletion_arrived(): f"cfg failpoint: {DELETE_BEFORE_CLEANUP_FAILPOINT} pause" ) - wait_until(100, 0.1, deletion_arrived) + wait_until(deletion_arrived) ps_http.configure_failpoints((DELETE_BEFORE_CLEANUP_FAILPOINT, "off")) diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py index 8d7ca7bc4e14..3f21dc895a3b 100644 --- a/test_runner/regress/test_tenant_detach.py +++ b/test_runner/regress/test_tenant_detach.py @@ -212,7 +212,7 @@ async def sleep_and_reattach(pageserver_http: PageserverHttpClient, tenant_id: T nonlocal updates_started, updates_finished, updates_to_perform # Wait until we have performed some updates - wait_until(20, 0.5, lambda: updates_finished > 500) + wait_until(lambda: updates_finished > 500) log.info("Detaching tenant") pageserver_http.tenant_detach(tenant_id) @@ -512,7 +512,7 @@ def found_broken(): ) assert only_int(active) == 0 and only_int(broken) == 1 and only_int(broken_set) == 1 - wait_until(10, 0.5, found_broken) + wait_until(found_broken) client.tenant_detach(env.initial_tenant) @@ -524,7 +524,7 @@ def found_cleaned_up(): ) assert only_int(broken) == 0 and len(broken_set) == 0 - wait_until(10, 0.5, found_cleaned_up) + wait_until(found_cleaned_up) env.pageserver.tenant_attach(env.initial_tenant) @@ -536,4 +536,4 @@ def found_active(): ) assert only_int(active) == 1 and len(broken_set) == 0 - wait_until(10, 0.5, found_active) + wait_until(found_active) diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py index bf6120aa0aab..df53a98e926c 100644 --- a/test_runner/regress/test_tenant_relocation.py +++ b/test_runner/regress/test_tenant_relocation.py @@ -298,11 +298,7 @@ def test_tenant_relocation( destination_ps.tenant_attach(tenant_id) # wait for tenant to finish attaching - wait_until( - number_of_iterations=10, - interval=1, - func=lambda: assert_tenant_state(destination_http, tenant_id, "Active"), - ) + wait_until(lambda: assert_tenant_state(destination_http, tenant_id, "Active")) check_timeline_attached( destination_http, diff --git a/test_runner/regress/test_tenant_size.py 
b/test_runner/regress/test_tenant_size.py index 8b733da0c67f..713f89c60f6a 100644 --- a/test_runner/regress/test_tenant_size.py +++ b/test_runner/regress/test_tenant_size.py @@ -638,7 +638,7 @@ def test_synthetic_size_while_deleting(neon_env_builder: NeonEnvBuilder): with ThreadPoolExecutor(max_workers=1) as exec: completion = exec.submit(client.tenant_size, env.initial_tenant) _, last_offset = wait_until( - 10, 1.0, lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") + lambda: env.pageserver.assert_log_contains(f"at failpoint {failpoint}") ) timeline_delete_wait_completed(client, env.initial_tenant, branch_id) @@ -656,8 +656,6 @@ def test_synthetic_size_while_deleting(neon_env_builder: NeonEnvBuilder): with ThreadPoolExecutor(max_workers=1) as exec: completion = exec.submit(client.tenant_size, env.initial_tenant) wait_until( - 10, - 1.0, lambda: env.pageserver.assert_log_contains( f"at failpoint {failpoint}", offset=last_offset ), diff --git a/test_runner/regress/test_tenant_tasks.py b/test_runner/regress/test_tenant_tasks.py index 72183f5778b9..4c26b64d22b9 100644 --- a/test_runner/regress/test_tenant_tasks.py +++ b/test_runner/regress/test_tenant_tasks.py @@ -77,4 +77,4 @@ def assert_tasks_finish(): assert tasks_started == tasks_ended assert tasks_panicked is None or int(tasks_panicked) == 0 - wait_until(10, 0.2, assert_tasks_finish) + wait_until(assert_tasks_finish) diff --git a/test_runner/regress/test_tenants.py b/test_runner/regress/test_tenants.py index 158c3fddb0b3..d31901b384e5 100644 --- a/test_runner/regress/test_tenants.py +++ b/test_runner/regress/test_tenants.py @@ -330,7 +330,7 @@ def not_attaching(): assert len(tenants) == 1 assert all(t["state"]["slug"] != "Attaching" for t in tenants) - wait_until(10, 0.2, not_attaching) + wait_until(not_attaching) tenants = client.tenant_list() diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py index 8d3ddf7e54a5..6b27c41d1c3b 100644 --- a/test_runner/regress/test_tenants_with_remote_storage.py +++ b/test_runner/regress/test_tenants_with_remote_storage.py @@ -178,11 +178,7 @@ def test_tenants_attached_after_download(neon_env_builder: NeonEnvBuilder): env.pageserver.start() client = env.pageserver.http_client() - wait_until( - number_of_iterations=5, - interval=1, - func=lambda: assert_tenant_state(client, tenant_id, "Active"), - ) + wait_until(lambda: assert_tenant_state(client, tenant_id, "Active")) restored_timelines = client.timeline_list(tenant_id) assert ( @@ -257,11 +253,7 @@ def test_tenant_redownloads_truncated_file_on_startup( env.pageserver.start() client = env.pageserver.http_client() - wait_until( - number_of_iterations=5, - interval=1, - func=lambda: assert_tenant_state(client, tenant_id, "Active"), - ) + wait_until(lambda: assert_tenant_state(client, tenant_id, "Active")) restored_timelines = client.timeline_list(tenant_id) assert ( diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index bc2e048f6942..5a1e493bbec7 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -227,8 +227,8 @@ def leaf_offloaded(): ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=leaf_timeline_id) assert timeline_offloaded_logged(leaf_timeline_id) - wait_until(30, 1, leaf_offloaded) - wait_until(30, 1, parent_offloaded) + wait_until(leaf_offloaded) + wait_until(parent_offloaded) # Offloaded child timelines should still prevent deletion 
with pytest.raises( @@ -331,7 +331,7 @@ def child_offloaded(): ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=child_timeline_id) assert timeline_offloaded_api(child_timeline_id) - wait_until(30, 1, child_offloaded) + wait_until(child_offloaded) assert timeline_offloaded_api(child_timeline_id) assert not timeline_offloaded_api(root_timeline_id) diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index 155709e1066d..fbece6836729 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -21,7 +21,6 @@ assert_prefix_empty, assert_prefix_not_empty, many_small_layers_tenant_config, - poll_for_remote_storage_iterations, timeline_delete_wait_completed, wait_for_last_record_lsn, wait_for_upload, @@ -94,12 +93,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv): assert timeline_path.exists() # retry deletes when compaction or gc is running in pageserver - # TODO: review whether this wait_until is actually necessary, we do an await() internally - wait_until( - number_of_iterations=3, - interval=0.2, - func=lambda: timeline_delete_wait_completed(ps_http, env.initial_tenant, leaf_timeline_id), - ) + timeline_delete_wait_completed(ps_http, env.initial_tenant, leaf_timeline_id) assert not timeline_path.exists() @@ -111,13 +105,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv): ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id) assert exc.value.status_code == 404 - wait_until( - number_of_iterations=3, - interval=0.2, - func=lambda: timeline_delete_wait_completed( - ps_http, env.initial_tenant, parent_timeline_id - ), - ) + timeline_delete_wait_completed(ps_http, env.initial_tenant, parent_timeline_id) # Check that we didn't pick up the timeline again after restart. # See https://github.com/neondatabase/neon/issues/3560 @@ -226,8 +214,6 @@ def test_delete_timeline_exercise_crash_safety_failpoints( ps_http.configure_failpoints((failpoint, "return")) - iterations = poll_for_remote_storage_iterations(remote_storage_kind) - # These failpoints are earlier than background task is spawned. # so they result in api request failure. if failpoint in ( @@ -244,7 +230,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints( tenant_id=env.initial_tenant, timeline_id=timeline_id, expected_state="Broken", - iterations=iterations, + iterations=40, ) reason = timeline_info["state"]["Broken"]["reason"] @@ -257,25 +243,21 @@ def test_delete_timeline_exercise_crash_safety_failpoints( env.pageserver.stop() env.pageserver.start() - wait_until_tenant_active(ps_http, env.initial_tenant, iterations=iterations) + wait_until_tenant_active(ps_http, env.initial_tenant) if failpoint == "timeline-delete-before-index-deleted-at": # We crashed before persisting this to remote storage, need to retry delete request timeline_delete_wait_completed(ps_http, env.initial_tenant, timeline_id) else: # Pageserver should've resumed deletion after restart. 
- wait_timeline_detail_404( - ps_http, env.initial_tenant, timeline_id, iterations=iterations - ) + wait_timeline_detail_404(ps_http, env.initial_tenant, timeline_id) elif check is Check.RETRY_WITHOUT_RESTART: # this should succeed # this also checks that delete can be retried even when timeline is in Broken state ps_http.configure_failpoints((failpoint, "off")) - timeline_delete_wait_completed( - ps_http, env.initial_tenant, timeline_id, iterations=iterations - ) + timeline_delete_wait_completed(ps_http, env.initial_tenant, timeline_id) # Check remote is empty if remote_storage_kind is RemoteStorageKind.MOCK_S3: @@ -378,7 +360,7 @@ def test_timeline_resurrection_on_attach( env.pageserver.tenant_attach(tenant_id=tenant_id) - wait_until_tenant_active(ps_http, tenant_id=tenant_id, iterations=10, period=0.5) + wait_until_tenant_active(ps_http, tenant_id=tenant_id) timelines = ps_http.timeline_list(tenant_id=tenant_id) assert {TimelineId(tl["timeline_id"]) for tl in timelines} == { @@ -439,7 +421,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild # Wait for tenant to finish loading. wait_until_tenant_active(ps_http, tenant_id=env.initial_tenant, iterations=10, period=1) - wait_timeline_detail_404(ps_http, env.initial_tenant, leaf_timeline_id, iterations=4) + wait_timeline_detail_404(ps_http, env.initial_tenant, leaf_timeline_id) assert ( not leaf_timeline_path.exists() @@ -481,11 +463,10 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild ) # for some reason the check above doesnt immediately take effect for the below. - # Assume it is mock server incosistency and check twice. + # Assume it is mock server incosistency and check a few times. wait_until( - 2, - 0.5, lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage), + timeout=2, ) # We deleted our only tenant, and the scrubber fails if it detects nothing @@ -544,7 +525,7 @@ def first_call_hit_failpoint(): f".*{child_timeline_id}.*at failpoint {stuck_failpoint}" ) - wait_until(50, 0.1, first_call_hit_failpoint) + wait_until(first_call_hit_failpoint, interval=0.1, status_interval=1.0) # make the second call and assert behavior log.info("second call start") @@ -613,7 +594,7 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder): def hit_failpoint(): env.pageserver.assert_log_contains(at_failpoint_log_message) - wait_until(50, 0.1, hit_failpoint) + wait_until(hit_failpoint, interval=0.1) # we log this error if a client hangs up # might as well use it as another indicator that the test works @@ -623,7 +604,7 @@ def hit_failpoint(): def got_hangup_log_message(): env.pageserver.assert_log_contains(hangup_log_message) - wait_until(50, 0.1, got_hangup_log_message) + wait_until(got_hangup_log_message, interval=0.1) # check that the timeline is still present ps_http.timeline_detail(env.initial_tenant, child_timeline_id) @@ -635,10 +616,10 @@ def first_request_finished(): message = f".*DELETE.*{child_timeline_id}.*Cancelled request finished" env.pageserver.assert_log_contains(message) - wait_until(50, 0.1, first_request_finished) + wait_until(first_request_finished, interval=0.1) # check that the timeline is gone - wait_timeline_detail_404(ps_http, env.initial_tenant, child_timeline_id, iterations=10) + wait_timeline_detail_404(ps_http, env.initial_tenant, child_timeline_id) def test_timeline_delete_works_for_remote_smoke( @@ -707,7 +688,7 @@ def test_timeline_delete_works_for_remote_smoke( # for some reason the check above doesnt immediately take 
effect for the below. # Assume it is mock server inconsistency and check twice. - wait_until(2, 0.5, lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage)) + wait_until(lambda: assert_prefix_empty(neon_env_builder.pageserver_remote_storage)) # We deleted our only tenant, and the scrubber fails if it detects nothing neon_env_builder.disable_scrub_on_exit() @@ -753,15 +734,13 @@ def test_delete_orphaned_objects( env.pageserver.allowed_errors.append(f".*failpoint: {failpoint}") - iterations = poll_for_remote_storage_iterations(remote_storage_kind) - ps_http.timeline_delete(env.initial_tenant, timeline_id) timeline_info = wait_until_timeline_state( pageserver_http=ps_http, tenant_id=env.initial_tenant, timeline_id=timeline_id, expected_state="Broken", - iterations=iterations, + iterations=40, ) reason = timeline_info["state"]["Broken"]["reason"] @@ -827,8 +806,6 @@ def test_timeline_delete_resumed_on_attach( ) ) - iterations = poll_for_remote_storage_iterations(remote_storage_kind) - ps_http.timeline_delete(tenant_id, timeline_id) timeline_info = wait_until_timeline_state( @@ -836,7 +813,7 @@ def test_timeline_delete_resumed_on_attach( tenant_id=env.initial_tenant, timeline_id=timeline_id, expected_state="Broken", - iterations=iterations, + iterations=40, ) reason = timeline_info["state"]["Broken"]["reason"] @@ -871,7 +848,7 @@ def test_timeline_delete_resumed_on_attach( env.pageserver.tenant_attach(tenant_id=tenant_id) # delete should be resumed - wait_timeline_detail_404(ps_http, env.initial_tenant, timeline_id, iterations=iterations) + wait_timeline_detail_404(ps_http, env.initial_tenant, timeline_id) tenant_path = env.pageserver.timeline_dir(tenant_id, timeline_id) assert not tenant_path.exists() diff --git a/test_runner/regress/test_timeline_detach_ancestor.py b/test_runner/regress/test_timeline_detach_ancestor.py index 9c7e851ba87b..2c3ee38baef1 100644 --- a/test_runner/regress/test_timeline_detach_ancestor.py +++ b/test_runner/regress/test_timeline_detach_ancestor.py @@ -203,7 +203,7 @@ def test_ancestor_detach_branched_from( ) client.timeline_delete(env.initial_tenant, env.initial_timeline) - wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0) + wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline) # because we do the fullbackup from ancestor at the branch_lsn, the zenith.signal is always different # as there is always "PREV_LSN: invalid" for "before" @@ -336,10 +336,10 @@ def test_ancestor_detach_reparents_earlier(neon_env_builder: NeonEnvBuilder): # delete the timelines to confirm detach actually worked client.timeline_delete(env.initial_tenant, after) - wait_timeline_detail_404(client, env.initial_tenant, after, 10, 1.0) + wait_timeline_detail_404(client, env.initial_tenant, after) client.timeline_delete(env.initial_tenant, env.initial_timeline) - wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline, 10, 1.0) + wait_timeline_detail_404(client, env.initial_tenant, env.initial_timeline) def test_detached_receives_flushes_while_being_detached(neon_env_builder: NeonEnvBuilder): @@ -973,17 +973,17 @@ def is_deleted(): with ThreadPoolExecutor(max_workers=2) as pool: try: fut = pool.submit(detach_ancestor) - offset = wait_until(10, 1.0, at_failpoint) + offset = wait_until(at_failpoint) delete = pool.submit(start_delete) - offset = wait_until(10, 1.0, lambda: at_waiting_on_gate_close(offset)) + offset = wait_until(lambda: at_waiting_on_gate_close(offset)) victim_http.configure_failpoints((pausepoint, 
"off")) delete.result() - assert wait_until(10, 1.0, is_deleted), f"unimplemented mode {mode}" + assert wait_until(is_deleted), f"unimplemented mode {mode}" # TODO: match the error with pytest.raises(PageserverApiException) as exc: @@ -1115,11 +1115,11 @@ def first_branch_gone(): with ThreadPoolExecutor(max_workers=1) as pool: try: fut = pool.submit(detach_timeline) - wait_until(10, 1.0, paused_at_failpoint) + wait_until(paused_at_failpoint) # let stuck complete stuck_http.configure_failpoints((pausepoint, "off")) - wait_until(10, 1.0, first_completed) + wait_until(first_completed) if mode == "delete_reparentable_timeline": assert first_branch is not None @@ -1127,7 +1127,7 @@ def first_branch_gone(): env.initial_tenant, first_branch ) victim_http.configure_failpoints((pausepoint, "off")) - wait_until(10, 1.0, first_branch_gone) + wait_until(first_branch_gone) elif mode == "create_reparentable_timeline": first_branch = create_reparentable_timeline() victim_http.configure_failpoints((pausepoint, "off")) @@ -1271,11 +1271,11 @@ def first_completed(): with ThreadPoolExecutor(max_workers=1) as pool: try: fut = pool.submit(detach_timeline) - wait_until(10, 1.0, paused_at_failpoint) + wait_until(paused_at_failpoint) # let stuck complete stuck_http.configure_failpoints((pausepoint, "off")) - wait_until(10, 1.0, first_completed) + wait_until(first_completed) victim_http.configure_failpoints((pausepoint, "off")) @@ -1456,7 +1456,7 @@ def try_detach(): # other tests take the "detach? reparent complete", but this only hits # "complete". http.timeline_delete(env.initial_tenant, env.initial_timeline) - wait_timeline_detail_404(http, env.initial_tenant, env.initial_timeline, 20) + wait_timeline_detail_404(http, env.initial_tenant, env.initial_timeline) http.configure_failpoints(("timeline-detach-ancestor::complete_before_uploading", "off")) @@ -1518,7 +1518,7 @@ def delete_detached(): with ThreadPoolExecutor(max_workers=1) as pool: detach = pool.submit(detach_and_get_stuck) - offset = wait_until(10, 1.0, request_processing_noted_in_log) + offset = wait_until(request_processing_noted_in_log) # make this named fn tor more clear failure test output logging def pausepoint_hit_with_gc_paused() -> LogCursor: @@ -1529,11 +1529,11 @@ def pausepoint_hit_with_gc_paused() -> LogCursor: ) return at - offset = wait_until(10, 1.0, pausepoint_hit_with_gc_paused) + offset = wait_until(pausepoint_hit_with_gc_paused) delete_detached() - wait_timeline_detail_404(http, env.initial_tenant, detached, 10, 1.0) + wait_timeline_detail_404(http, env.initial_tenant, detached) http.configure_failpoints((failpoint, "off")) diff --git a/test_runner/regress/test_timeline_gc_blocking.py b/test_runner/regress/test_timeline_gc_blocking.py index 5a5ca3290a07..7605e1f758b9 100644 --- a/test_runner/regress/test_timeline_gc_blocking.py +++ b/test_runner/regress/test_timeline_gc_blocking.py @@ -61,7 +61,7 @@ def test_gc_blocking_by_timeline(neon_env_builder: NeonEnvBuilder, sharded: bool # deletion unblocks gc http.timeline_delete(env.initial_tenant, foo_branch) - wait_timeline_detail_404(http, env.initial_tenant, foo_branch, 10, 1.0) + wait_timeline_detail_404(http, env.initial_tenant, foo_branch) wait_for_another_gc_round() pss.assert_log_contains(gc_active_line) diff --git a/test_runner/regress/test_timeline_size.py b/test_runner/regress/test_timeline_size.py index 4528bc618044..95bf9106cd94 100644 --- a/test_runner/regress/test_timeline_size.py +++ b/test_runner/regress/test_timeline_size.py @@ -396,11 +396,7 @@ def 
test_timeline_physical_size_init(neon_env_builder: NeonEnvBuilder): # Wait for the tenant to be loaded client = env.pageserver.http_client() - wait_until( - number_of_iterations=5, - interval=1, - func=lambda: assert_tenant_state(client, env.initial_tenant, "Active"), - ) + wait_until(lambda: assert_tenant_state(client, env.initial_tenant, "Active")) assert_physical_size_invariants( get_physical_size_values(env, env.initial_tenant, new_timeline_id), @@ -433,7 +429,7 @@ def check(): get_physical_size_values(env, env.initial_tenant, new_timeline_id), ) - wait_until(10, 1, check) + wait_until(check) def test_timeline_physical_size_post_compaction(neon_env_builder: NeonEnvBuilder): @@ -721,7 +717,7 @@ def wait_for_tenant_startup_completions(client: PageserverHttpClient, count: int def condition(): assert client.get_metric_value("pageserver_tenant_startup_complete_total") == count - wait_until(5, 1.0, condition) + wait_until(condition) def test_ondemand_activation(neon_env_builder: NeonEnvBuilder): @@ -768,7 +764,7 @@ def at_least_one_active(): assert "Active" in set(get_tenant_states().values()) # One tenant should activate, then get stuck in their logical size calculation - wait_until(10, 1, at_least_one_active) + wait_until(at_least_one_active) # Wait some walltime to gain confidence that other tenants really are stuck and not proceeding to activate time.sleep(5) @@ -836,13 +832,13 @@ def at_least_one_active(): def all_active(): assert all(s == "Active" for s in get_tenant_states().values()) - wait_until(10, 1, all_active) + wait_until(all_active) # Final control check: restarting with no failpoints at all results in all tenants coming active # without being prompted by client I/O env.pageserver.stop() env.pageserver.start() - wait_until(10, 1, all_active) + wait_until(all_active) assert ( pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants @@ -856,7 +852,7 @@ def all_active(): extra_env_vars={"FAILPOINTS": "timeline-calculate-logical-size-pause=pause"} ) - wait_until(10, 1, at_least_one_active) + wait_until(at_least_one_active) detach_tenant_id = list( [(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"] @@ -881,7 +877,7 @@ def all_active(): # Check that all the stuck tenants proceed to active (apart from the one that deletes, and the one # we detached) - wait_until(10, 1, all_active) + wait_until(all_active) assert len(get_tenant_states()) == n_tenants - 2 @@ -908,7 +904,7 @@ def delete_tenant(): try: # Deletion will get to the point in shutdown where it's waiting for timeline shutdown, then # hang because of our failpoint blocking activation. 
- wait_until(10, 1, shutting_down) + wait_until(shutting_down) finally: log.info("Clearing failpoint") pageserver_http.configure_failpoints(("timeline-calculate-logical-size-pause", "off")) @@ -1030,13 +1026,13 @@ def one_is_active(): log.info(f"{states}") assert len(states["Active"]) == 1 - wait_until(10, 1, one_is_active) + wait_until(one_is_active) def other_is_attaching(): states = get_tenant_states() assert len(states["Attaching"]) == 1 - wait_until(10, 1, other_is_attaching) + wait_until(other_is_attaching) def eager_tenant_is_active(): resp = client.tenant_status(eager_tenant) @@ -1053,7 +1049,7 @@ def eager_tenant_is_active(): }, lazy=False, ) - wait_until(10, 1, eager_tenant_is_active) + wait_until(eager_tenant_is_active) other_is_attaching() @@ -1096,7 +1092,7 @@ def initial_tenant_is_active(): resp = client.tenant_status(env.initial_tenant) assert resp["state"]["slug"] == "Active" - wait_until(10, 1, initial_tenant_is_active) + wait_until(initial_tenant_is_active) # even though the initial tenant is now active, because it was startup time # attach, it will consume the only permit because logical size calculation @@ -1119,7 +1115,7 @@ def lazy_tenant_is_attaching(): assert resp["state"]["slug"] == "Attaching" # paused logical size calculation of env.initial_tenant is keeping it attaching - wait_until(10, 1, lazy_tenant_is_attaching) + wait_until(lazy_tenant_is_attaching) for _ in range(5): lazy_tenant_is_attaching() @@ -1132,10 +1128,10 @@ def lazy_tenant_is_active(): if activation_method == "endpoint": with env.endpoints.create_start("main", tenant_id=lazy_tenant): # starting up the endpoint should make it jump the queue - wait_until(10, 1, lazy_tenant_is_active) + wait_until(lazy_tenant_is_active) elif activation_method == "branch": env.create_timeline("second_branch", lazy_tenant) - wait_until(10, 1, lazy_tenant_is_active) + wait_until(lazy_tenant_is_active) elif activation_method == "delete": delete_lazy_activating(lazy_tenant, env.pageserver, expect_attaching=True) else: diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index 8fa33b81a9dc..23d4f23cdb84 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -2136,7 +2136,7 @@ def evicted_on_source(): # Check that on source no segment files are present assert src_sk.list_segments(tenant_id, timeline_id) == [] - wait_until(60, 1, evicted_on_source) + wait_until(evicted_on_source, timeout=60) # Invoke pull_timeline: source should serve snapshot request without promoting anything to local disk, # destination should import the control file only & go into evicted mode immediately @@ -2155,7 +2155,7 @@ def evicted_on_destination(): # This should be fast, it is a wait_until because eviction state is updated # in the background wrt pull_timeline. 
- wait_until(10, 0.1, evicted_on_destination) + wait_until(evicted_on_destination, timeout=1.0, interval=0.1) # Delete the timeline on the source, to prove that deletion works on an # evicted timeline _and_ that the final compute test is really not using @@ -2178,7 +2178,7 @@ def unevicted_on_dest(): n_evicted = dst_sk.http_client().get_metric_value("safekeeper_evicted_timelines") assert n_evicted == 0 - wait_until(10, 1, unevicted_on_dest) + wait_until(unevicted_on_dest, interval=0.1, timeout=1.0) # In this test we check for excessive START_REPLICATION and START_WAL_PUSH queries @@ -2606,10 +2606,10 @@ def all_evicted(): assert n_evicted # make mypy happy assert int(n_evicted) == n_timelines - wait_until(60, 0.5, all_evicted) + wait_until(all_evicted, timeout=30) # restart should preserve the metric value sk.stop().start() - wait_until(60, 0.5, all_evicted) + wait_until(all_evicted) # and endpoint start should reduce is endpoints[0].start() @@ -2618,7 +2618,7 @@ def one_unevicted(): assert n_evicted # make mypy happy assert int(n_evicted) < n_timelines - wait_until(60, 0.5, one_unevicted) + wait_until(one_unevicted) # Test resetting uploaded partial segment state. @@ -2666,7 +2666,7 @@ def evicted(): if isinstance(eviction_state, str) and eviction_state == "Present": raise Exception("eviction didn't happen yet") - wait_until(30, 1, evicted) + wait_until(evicted) # it must have uploaded something uploaded_segs = sk.list_uploaded_segments(tenant_id, timeline_id) log.info(f"uploaded segments before reset: {uploaded_segs}") @@ -2763,7 +2763,7 @@ def source_partial_segment_uploaded(): raise Exception("Partial segment not uploaded yet") - source_partial_segment = wait_until(15, 1, source_partial_segment_uploaded) + source_partial_segment = wait_until(source_partial_segment_uploaded) log.info( f"Uploaded segments before pull are {src_sk.list_uploaded_segments(tenant_id, timeline_id)}" ) @@ -2787,7 +2787,7 @@ def evicted(): if evictions is None or evictions == 0: raise Exception("Eviction did not happen on source safekeeper yet") - wait_until(30, 1, evicted) + wait_until(evicted) endpoint.start(safekeepers=[2, 3]) @@ -2804,7 +2804,7 @@ def new_partial_segment_uploaded(): ) endpoint.safe_psql("insert into t select generate_series(1, 1000), 'pear'") - wait_until(15, 1, new_partial_segment_uploaded) + wait_until(new_partial_segment_uploaded) log.info( f"Uploaded segments after post-pull ingest are {src_sk.list_uploaded_segments(tenant_id, timeline_id)}" @@ -2833,4 +2833,4 @@ def unevicted(): if unevictions is None or unevictions == 0: raise Exception("Uneviction did not happen on source safekeeper yet") - wait_until(10, 1, unevicted) + wait_until(unevicted) diff --git a/test_runner/regress/test_wal_receiver.py b/test_runner/regress/test_wal_receiver.py index 294f86ffa720..d22a900c5923 100644 --- a/test_runner/regress/test_wal_receiver.py +++ b/test_runner/regress/test_wal_receiver.py @@ -97,7 +97,7 @@ def all_sks_in_wareceiver_state(): str(safekeeper.id) in exception_string ), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after WAL wait timeout" - wait_until(60, 0.5, all_sks_in_wareceiver_state) + wait_until(all_sks_in_wareceiver_state, timeout=30) stopped_safekeeper = env.safekeepers[-1] stopped_safekeeper_id = stopped_safekeeper.id @@ -124,7 +124,7 @@ def all_but_stopped_sks_in_wareceiver_state(): str(safekeeper.id) in exception_string ), f"Should have safekeeper {safekeeper.id} printed in walreceiver state after 2nd WAL wait timeout" - wait_until(60, 0.5, 
all_but_stopped_sks_in_wareceiver_state) + wait_until(all_but_stopped_sks_in_wareceiver_state, timeout=30) def insert_test_elements(env: NeonEnv, tenant_id: TenantId, start: int, count: int): From bd0936919885f130e3d6aedf42ba3ca7047c56e4 Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 2 Dec 2024 11:50:22 +0000 Subject: [PATCH 03/65] storcon: add metric for AZ scheduling violations (#9949) ## Problem We can't easily tell how far the state of shards is from their AZ preferences. This can be a cause of performance issues, so it's important for diagnosability that we can tell easily if there are significant numbers of shards that aren't running in their preferred AZ. Related: https://github.com/neondatabase/cloud/issues/15413 ## Summary of changes - In reconcile_all, count shards that are scheduled into the wrong AZ (if they have a preference), and publish it as a prometheus gauge. - Also calculate a statistic for how many shards wanted to reconcile but couldn't. This is clearly a lazy calculation: reconcile all only runs periodically. But that's okay: shards in the wrong AZ is something that only matters if it stays that way for some period of time. --- storage_controller/src/metrics.rs | 6 ++++++ storage_controller/src/service.rs | 32 +++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/storage_controller/src/metrics.rs b/storage_controller/src/metrics.rs index a1f7bc24575b..6d5885eba657 100644 --- a/storage_controller/src/metrics.rs +++ b/storage_controller/src/metrics.rs @@ -50,6 +50,12 @@ pub(crate) struct StorageControllerMetricGroup { /// Count of how many times we make an optimization change to a tenant's scheduling pub(crate) storage_controller_schedule_optimization: measured::Counter, + /// How many shards are not scheduled into their preferred AZ + pub(crate) storage_controller_schedule_az_violation: measured::Gauge, + + /// How many shards would like to reconcile but were blocked by concurrency limits + pub(crate) storage_controller_pending_reconciles: measured::Gauge, + /// HTTP request status counters for handled requests pub(crate) storage_controller_http_request_status: measured::CounterVec, diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 636ccf11a120..631fdb49239c 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -6016,14 +6016,33 @@ impl Service { let (nodes, tenants, _scheduler) = locked.parts_mut(); let pageservers = nodes.clone(); + // This function is an efficient place to update lazy statistics, since we are walking + // all tenants. + let mut pending_reconciles = 0; + let mut az_violations = 0; + let mut reconciles_spawned = 0; for shard in tenants.values_mut() { + // Accumulate scheduling statistics + if let (Some(attached), Some(preferred)) = + (shard.intent.get_attached(), shard.preferred_az()) + { + let node_az = nodes + .get(attached) + .expect("Nodes exist if referenced") + .get_availability_zone_id(); + if node_az != preferred { + az_violations += 1; + } + } + // Skip checking if this shard is already enqueued for reconciliation if shard.delayed_reconcile && self.reconciler_concurrency.available_permits() == 0 { // If there is something delayed, then return a nonzero count so that // callers like reconcile_all_now do not incorrectly get the impression // that the system is in a quiescent state. 
reconciles_spawned = std::cmp::max(1, reconciles_spawned); + pending_reconciles += 1; continue; } @@ -6031,9 +6050,22 @@ impl Service { // dirty, spawn another rone if self.maybe_reconcile_shard(shard, &pageservers).is_some() { reconciles_spawned += 1; + } else if shard.delayed_reconcile { + // Shard wanted to reconcile but for some reason couldn't. + pending_reconciles += 1; } } + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_schedule_az_violation + .set(az_violations as i64); + + metrics::METRICS_REGISTRY + .metrics_group + .storage_controller_pending_reconciles + .set(pending_reconciles as i64); + reconciles_spawned } From cd1d2d19968b197d60f122454e83cfe485af1f7d Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 2 Dec 2024 12:29:57 +0000 Subject: [PATCH 04/65] fix(proxy): forward notifications from authentication (#9948) Fixes https://github.com/neondatabase/cloud/issues/20973. This refactors `connect_raw` in order to return direct access to the delayed notices. I cannot find a way to test this with psycopg2 unfortunately, although testing it with psql does return the expected results. --- libs/pq_proto/src/lib.rs | 6 +++ .../postgres-protocol2/src/message/backend.rs | 4 ++ .../proxy/tokio-postgres2/src/cancel_token.rs | 8 ++-- libs/proxy/tokio-postgres2/src/client.rs | 13 +++--- libs/proxy/tokio-postgres2/src/config.rs | 6 +-- libs/proxy/tokio-postgres2/src/connect.rs | 42 +++++++++++++++---- libs/proxy/tokio-postgres2/src/connect_raw.rs | 37 +++++++++------- libs/proxy/tokio-postgres2/src/lib.rs | 5 ++- proxy/src/compute.rs | 38 ++++++++++++----- proxy/src/proxy/mod.rs | 8 ++-- proxy/src/proxy/tests/mod.rs | 8 ++-- 11 files changed, 117 insertions(+), 58 deletions(-) diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 4b0331999d33..43dfbc22a45a 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -565,6 +565,8 @@ pub enum BeMessage<'a> { /// Batch of interpreted, shard filtered WAL records, /// ready for the pageserver to ingest InterpretedWalRecords(InterpretedWalRecordsBody<'a>), + + Raw(u8, &'a [u8]), } /// Common shorthands. @@ -754,6 +756,10 @@ impl BeMessage<'_> { /// one more buffer. pub fn write(buf: &mut BytesMut, message: &BeMessage) -> Result<(), ProtocolError> { match message { + BeMessage::Raw(code, data) => { + buf.put_u8(*code); + write_body(buf, |b| b.put_slice(data)) + } BeMessage::AuthenticationOk => { buf.put_u8(b'R'); write_body(buf, |buf| { diff --git a/libs/proxy/postgres-protocol2/src/message/backend.rs b/libs/proxy/postgres-protocol2/src/message/backend.rs index 356d142f3fc8..33d77fc25261 100644 --- a/libs/proxy/postgres-protocol2/src/message/backend.rs +++ b/libs/proxy/postgres-protocol2/src/message/backend.rs @@ -541,6 +541,10 @@ impl NoticeResponseBody { pub fn fields(&self) -> ErrorFields<'_> { ErrorFields { buf: &self.storage } } + + pub fn as_bytes(&self) -> &[u8] { + &self.storage + } } pub struct NotificationResponseBody { diff --git a/libs/proxy/tokio-postgres2/src/cancel_token.rs b/libs/proxy/tokio-postgres2/src/cancel_token.rs index b949bf358f37..a10e8bf5c3a4 100644 --- a/libs/proxy/tokio-postgres2/src/cancel_token.rs +++ b/libs/proxy/tokio-postgres2/src/cancel_token.rs @@ -10,10 +10,10 @@ use tokio::net::TcpStream; /// connection. 
#[derive(Clone)] pub struct CancelToken { - pub(crate) socket_config: Option, - pub(crate) ssl_mode: SslMode, - pub(crate) process_id: i32, - pub(crate) secret_key: i32, + pub socket_config: Option, + pub ssl_mode: SslMode, + pub process_id: i32, + pub secret_key: i32, } impl CancelToken { diff --git a/libs/proxy/tokio-postgres2/src/client.rs b/libs/proxy/tokio-postgres2/src/client.rs index 96200b71e73b..a7cd53afc35d 100644 --- a/libs/proxy/tokio-postgres2/src/client.rs +++ b/libs/proxy/tokio-postgres2/src/client.rs @@ -138,7 +138,7 @@ impl InnerClient { } #[derive(Clone)] -pub(crate) struct SocketConfig { +pub struct SocketConfig { pub host: Host, pub port: u16, pub connect_timeout: Option, @@ -152,7 +152,7 @@ pub(crate) struct SocketConfig { pub struct Client { inner: Arc, - socket_config: Option, + socket_config: SocketConfig, ssl_mode: SslMode, process_id: i32, secret_key: i32, @@ -161,6 +161,7 @@ pub struct Client { impl Client { pub(crate) fn new( sender: mpsc::UnboundedSender, + socket_config: SocketConfig, ssl_mode: SslMode, process_id: i32, secret_key: i32, @@ -172,7 +173,7 @@ impl Client { buffer: Default::default(), }), - socket_config: None, + socket_config, ssl_mode, process_id, secret_key, @@ -188,10 +189,6 @@ impl Client { &self.inner } - pub(crate) fn set_socket_config(&mut self, socket_config: SocketConfig) { - self.socket_config = Some(socket_config); - } - /// Creates a new prepared statement. /// /// Prepared statements can be executed repeatedly, and may contain query parameters (indicated by `$1`, `$2`, etc), @@ -412,7 +409,7 @@ impl Client { /// connection associated with this client. pub fn cancel_token(&self) -> CancelToken { CancelToken { - socket_config: self.socket_config.clone(), + socket_config: Some(self.socket_config.clone()), ssl_mode: self.ssl_mode, process_id: self.process_id, secret_key: self.secret_key, diff --git a/libs/proxy/tokio-postgres2/src/config.rs b/libs/proxy/tokio-postgres2/src/config.rs index 969c20ba47e2..26124b38ef8f 100644 --- a/libs/proxy/tokio-postgres2/src/config.rs +++ b/libs/proxy/tokio-postgres2/src/config.rs @@ -2,6 +2,7 @@ use crate::connect::connect; use crate::connect_raw::connect_raw; +use crate::connect_raw::RawConnection; use crate::tls::MakeTlsConnect; use crate::tls::TlsConnect; use crate::{Client, Connection, Error}; @@ -485,14 +486,11 @@ impl Config { connect(tls, self).await } - /// Connects to a PostgreSQL database over an arbitrary stream. - /// - /// All of the settings other than `user`, `password`, `dbname`, `options`, and `application_name` name are ignored. 
pub async fn connect_raw( &self, stream: S, tls: T, - ) -> Result<(Client, Connection), Error> + ) -> Result, Error> where S: AsyncRead + AsyncWrite + Unpin, T: TlsConnect, diff --git a/libs/proxy/tokio-postgres2/src/connect.rs b/libs/proxy/tokio-postgres2/src/connect.rs index 7517fe0cdeb9..98067d91f942 100644 --- a/libs/proxy/tokio-postgres2/src/connect.rs +++ b/libs/proxy/tokio-postgres2/src/connect.rs @@ -1,13 +1,16 @@ use crate::client::SocketConfig; +use crate::codec::BackendMessage; use crate::config::{Host, TargetSessionAttrs}; use crate::connect_raw::connect_raw; use crate::connect_socket::connect_socket; use crate::tls::{MakeTlsConnect, TlsConnect}; -use crate::{Client, Config, Connection, Error, SimpleQueryMessage}; +use crate::{Client, Config, Connection, Error, RawConnection, SimpleQueryMessage}; use futures_util::{future, pin_mut, Future, FutureExt, Stream}; +use postgres_protocol2::message::backend::Message; use std::io; use std::task::Poll; use tokio::net::TcpStream; +use tokio::sync::mpsc; pub async fn connect( mut tls: T, @@ -60,7 +63,36 @@ where T: TlsConnect, { let socket = connect_socket(host, port, config.connect_timeout).await?; - let (mut client, mut connection) = connect_raw(socket, tls, config).await?; + let RawConnection { + stream, + parameters, + delayed_notice, + process_id, + secret_key, + } = connect_raw(socket, tls, config).await?; + + let socket_config = SocketConfig { + host: host.clone(), + port, + connect_timeout: config.connect_timeout, + }; + + let (sender, receiver) = mpsc::unbounded_channel(); + let client = Client::new( + sender, + socket_config, + config.ssl_mode, + process_id, + secret_key, + ); + + // delayed notices are always sent as "Async" messages. + let delayed = delayed_notice + .into_iter() + .map(|m| BackendMessage::Async(Message::NoticeResponse(m))) + .collect(); + + let mut connection = Connection::new(stream, delayed, parameters, receiver); if let TargetSessionAttrs::ReadWrite = config.target_session_attrs { let rows = client.simple_query_raw("SHOW transaction_read_only"); @@ -102,11 +134,5 @@ where } } - client.set_socket_config(SocketConfig { - host: host.clone(), - port, - connect_timeout: config.connect_timeout, - }); - Ok((client, connection)) } diff --git a/libs/proxy/tokio-postgres2/src/connect_raw.rs b/libs/proxy/tokio-postgres2/src/connect_raw.rs index 80677af969f6..9c6f1a255200 100644 --- a/libs/proxy/tokio-postgres2/src/connect_raw.rs +++ b/libs/proxy/tokio-postgres2/src/connect_raw.rs @@ -3,27 +3,26 @@ use crate::config::{self, AuthKeys, Config, ReplicationMode}; use crate::connect_tls::connect_tls; use crate::maybe_tls_stream::MaybeTlsStream; use crate::tls::{TlsConnect, TlsStream}; -use crate::{Client, Connection, Error}; +use crate::Error; use bytes::BytesMut; use fallible_iterator::FallibleIterator; use futures_util::{ready, Sink, SinkExt, Stream, TryStreamExt}; use postgres_protocol2::authentication; use postgres_protocol2::authentication::sasl; use postgres_protocol2::authentication::sasl::ScramSha256; -use postgres_protocol2::message::backend::{AuthenticationSaslBody, Message}; +use postgres_protocol2::message::backend::{AuthenticationSaslBody, Message, NoticeResponseBody}; use postgres_protocol2::message::frontend; -use std::collections::{HashMap, VecDeque}; +use std::collections::HashMap; use std::io; use std::pin::Pin; use std::task::{Context, Poll}; use tokio::io::{AsyncRead, AsyncWrite}; -use tokio::sync::mpsc; use tokio_util::codec::Framed; pub struct StartupStream { inner: Framed, PostgresCodec>, buf: 
BackendMessages, - delayed: VecDeque, + delayed_notice: Vec, } impl Sink for StartupStream @@ -78,11 +77,19 @@ where } } +pub struct RawConnection { + pub stream: Framed, PostgresCodec>, + pub parameters: HashMap, + pub delayed_notice: Vec, + pub process_id: i32, + pub secret_key: i32, +} + pub async fn connect_raw( stream: S, tls: T, config: &Config, -) -> Result<(Client, Connection), Error> +) -> Result, Error> where S: AsyncRead + AsyncWrite + Unpin, T: TlsConnect, @@ -97,18 +104,20 @@ where }, ), buf: BackendMessages::empty(), - delayed: VecDeque::new(), + delayed_notice: Vec::new(), }; startup(&mut stream, config).await?; authenticate(&mut stream, config).await?; let (process_id, secret_key, parameters) = read_info(&mut stream).await?; - let (sender, receiver) = mpsc::unbounded_channel(); - let client = Client::new(sender, config.ssl_mode, process_id, secret_key); - let connection = Connection::new(stream.inner, stream.delayed, parameters, receiver); - - Ok((client, connection)) + Ok(RawConnection { + stream: stream.inner, + parameters, + delayed_notice: stream.delayed_notice, + process_id, + secret_key, + }) } async fn startup(stream: &mut StartupStream, config: &Config) -> Result<(), Error> @@ -347,9 +356,7 @@ where body.value().map_err(Error::parse)?.to_string(), ); } - Some(msg @ Message::NoticeResponse(_)) => { - stream.delayed.push_back(BackendMessage::Async(msg)) - } + Some(Message::NoticeResponse(body)) => stream.delayed_notice.push(body), Some(Message::ReadyForQuery(_)) => return Ok((process_id, secret_key, parameters)), Some(Message::ErrorResponse(body)) => return Err(Error::db(body)), Some(_) => return Err(Error::unexpected_message()), diff --git a/libs/proxy/tokio-postgres2/src/lib.rs b/libs/proxy/tokio-postgres2/src/lib.rs index 72ba8172b28e..57c639a7de51 100644 --- a/libs/proxy/tokio-postgres2/src/lib.rs +++ b/libs/proxy/tokio-postgres2/src/lib.rs @@ -1,9 +1,10 @@ //! An asynchronous, pipelined, PostgreSQL client. -#![warn(rust_2018_idioms, clippy::all, missing_docs)] +#![warn(rust_2018_idioms, clippy::all)] pub use crate::cancel_token::CancelToken; -pub use crate::client::Client; +pub use crate::client::{Client, SocketConfig}; pub use crate::config::Config; +pub use crate::connect_raw::RawConnection; pub use crate::connection::Connection; use crate::error::DbError; pub use crate::error::Error; diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 2abe88ac880f..b689b97a2100 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -6,6 +6,7 @@ use std::time::Duration; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; use once_cell::sync::OnceCell; +use postgres_protocol::message::backend::NoticeResponseBody; use pq_proto::StartupMessageParams; use rustls::client::danger::ServerCertVerifier; use rustls::crypto::ring; @@ -13,6 +14,7 @@ use rustls::pki_types::InvalidDnsNameError; use thiserror::Error; use tokio::net::TcpStream; use tokio_postgres::tls::MakeTlsConnect; +use tokio_postgres::{CancelToken, RawConnection}; use tracing::{debug, error, info, warn}; use crate::auth::parse_endpoint_param; @@ -277,6 +279,8 @@ pub(crate) struct PostgresConnection { pub(crate) cancel_closure: CancelClosure, /// Labels for proxy's metrics. 
pub(crate) aux: MetricsAuxInfo, + /// Notices received from compute after authenticating + pub(crate) delayed_notice: Vec, _guage: NumDbConnectionsGuard<'static>, } @@ -322,10 +326,19 @@ impl ConnCfg { // connect_raw() will not use TLS if sslmode is "disable" let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); - let (client, connection) = self.0.connect_raw(stream, tls).await?; + let connection = self.0.connect_raw(stream, tls).await?; drop(pause); - tracing::Span::current().record("pid", tracing::field::display(client.get_process_id())); - let stream = connection.stream.into_inner(); + + let RawConnection { + stream, + parameters, + delayed_notice, + process_id, + secret_key, + } = connection; + + tracing::Span::current().record("pid", tracing::field::display(process_id)); + let stream = stream.into_inner(); // TODO: lots of useful info but maybe we can move it elsewhere (eg traces?) info!( @@ -334,18 +347,23 @@ impl ConnCfg { self.0.get_ssl_mode() ); - // This is very ugly but as of now there's no better way to - // extract the connection parameters from tokio-postgres' connection. - // TODO: solve this problem in a more elegant manner (e.g. the new library). - let params = connection.parameters; - // NB: CancelToken is supposed to hold socket_addr, but we use connect_raw. // Yet another reason to rework the connection establishing code. - let cancel_closure = CancelClosure::new(socket_addr, client.cancel_token(), vec![]); + let cancel_closure = CancelClosure::new( + socket_addr, + CancelToken { + socket_config: None, + ssl_mode: self.0.get_ssl_mode(), + process_id, + secret_key, + }, + vec![], + ); let connection = PostgresConnection { stream, - params, + params: parameters, + delayed_notice, cancel_closure, aux, _guage: Metrics::get().proxy.db_connections.guard(ctx.protocol()), diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index 956036d29d2e..af97fb3d7159 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -384,11 +384,13 @@ pub(crate) async fn prepare_client_connection

( // The new token (cancel_key_data) will be sent to the client. let cancel_key_data = session.enable_query_cancellation(node.cancel_closure.clone()); + // Forward all deferred notices to the client. + for notice in &node.delayed_notice { + stream.write_message_noflush(&Be::Raw(b'N', notice.as_bytes()))?; + } + // Forward all postgres connection params to the client. - // Right now the implementation is very hacky and inefficent (ideally, - // we don't need an intermediate hashmap), but at least it should be correct. for (name, value) in &node.params { - // TODO: Theoretically, this could result in a big pile of params... stream.write_message_noflush(&Be::ParameterStatus { name: name.as_bytes(), value: value.as_bytes(), diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 2c2c2964b6bb..15be6c9724e8 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -233,7 +233,7 @@ async fn handshake_tls() -> anyhow::Result<()> { generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), NoAuth)); - let (_client, _conn) = tokio_postgres::Config::new() + let _conn = tokio_postgres::Config::new() .user("john_doe") .dbname("earth") .ssl_mode(SslMode::Require) @@ -249,7 +249,7 @@ async fn handshake_raw() -> anyhow::Result<()> { let proxy = tokio::spawn(dummy_proxy(client, None, NoAuth)); - let (_client, _conn) = tokio_postgres::Config::new() + let _conn = tokio_postgres::Config::new() .user("john_doe") .dbname("earth") .options("project=generic-project-name") @@ -296,7 +296,7 @@ async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> { Scram::new(password).await?, )); - let (_client, _conn) = tokio_postgres::Config::new() + let _conn = tokio_postgres::Config::new() .channel_binding(tokio_postgres::config::ChannelBinding::Require) .user("user") .dbname("db") @@ -320,7 +320,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> { Scram::new("password").await?, )); - let (_client, _conn) = tokio_postgres::Config::new() + let _conn = tokio_postgres::Config::new() .channel_binding(tokio_postgres::config::ChannelBinding::Disable) .user("user") .dbname("db") From c18716bb3fdb7044588b40a9a8dc0491ac82b4e4 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 2 Dec 2024 12:46:07 +0000 Subject: [PATCH 05/65] CI(replication-tests): fix notifications about replication-tests failures (#9950) ## Problem `if: ${{ github.event.schedule }}` gets skipped if a previous step has failed, but we want to run the step for both `success` and `failure` ## Summary of changes - Add `!cancelled()` to notification step if-condition, to skip only cancelled jobs --- .github/workflows/benchmarking.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml index ea8fee80c220..7621d72f64eb 100644 --- a/.github/workflows/benchmarking.yml +++ b/.github/workflows/benchmarking.yml @@ -249,7 +249,7 @@ jobs: # Post both success and failure to the Slack channel - name: Post to a Slack channel - if: ${{ github.event.schedule }} + if: ${{ github.event.schedule && !cancelled() }} uses: slackapi/slack-github-action@v1 with: channel-id: "C06T9AMNDQQ" # on-call-compute-staging-stream From 1b605716362138fd415f14c84747bbe434ee05f9 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Mon, 2 Dec 2024 16:38:12 +0100 Subject: [PATCH 06/65] proxy: Create Elasticache credentials provider lazily (#9967) ## Problem The 
credentials providers tries to connect to AWS STS even when we use plain Redis connections. ## Summary of changes * Construct the CredentialsProvider only when needed ("irsa"). --- proxy/src/bin/proxy.rs | 49 +++++--------------------------- proxy/src/redis/elasticache.rs | 51 ++++++++++++++++++++++++++++++---- 2 files changed, 53 insertions(+), 47 deletions(-) diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index b772a987ee68..c929b97d78a5 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -3,14 +3,6 @@ use std::pin::pin; use std::sync::Arc; use anyhow::bail; -use aws_config::environment::EnvironmentVariableCredentialsProvider; -use aws_config::imds::credentials::ImdsCredentialsProvider; -use aws_config::meta::credentials::CredentialsProviderChain; -use aws_config::meta::region::RegionProviderChain; -use aws_config::profile::ProfileFileCredentialsProvider; -use aws_config::provider_config::ProviderConfig; -use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; -use aws_config::Region; use futures::future::Either; use proxy::auth::backend::jwt::JwkCache; use proxy::auth::backend::{AuthRateLimiter, ConsoleRedirectBackend, MaybeOwned}; @@ -314,39 +306,7 @@ async fn main() -> anyhow::Result<()> { }; info!("Using region: {}", args.aws_region); - let region_provider = - RegionProviderChain::default_provider().or_else(Region::new(args.aws_region.clone())); - let provider_conf = - ProviderConfig::without_region().with_region(region_provider.region().await); - let aws_credentials_provider = { - // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" - CredentialsProviderChain::first_try("env", EnvironmentVariableCredentialsProvider::new()) - // uses "AWS_PROFILE" / `aws sso login --profile ` - .or_else( - "profile-sso", - ProfileFileCredentialsProvider::builder() - .configure(&provider_conf) - .build(), - ) - // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME" - // needed to access remote extensions bucket - .or_else( - "token", - WebIdentityTokenCredentialsProvider::builder() - .configure(&provider_conf) - .build(), - ) - // uses imds v2 - .or_else("imds", ImdsCredentialsProvider::builder().build()) - }; - let elasticache_credentials_provider = Arc::new(elasticache::CredentialsProvider::new( - elasticache::AWSIRSAConfig::new( - args.aws_region.clone(), - args.redis_cluster_name, - args.redis_user_id, - ), - aws_credentials_provider, - )); + // TODO: untangle the config args let regional_redis_client = match (args.redis_auth_type.as_str(), &args.redis_notifications) { ("plain", redis_url) => match redis_url { None => { @@ -361,7 +321,12 @@ async fn main() -> anyhow::Result<()> { ConnectionWithCredentialsProvider::new_with_credentials_provider( host.to_string(), port, - elasticache_credentials_provider.clone(), + elasticache::CredentialsProvider::new( + args.aws_region, + args.redis_cluster_name, + args.redis_user_id, + ) + .await, ), ), (None, None) => { diff --git a/proxy/src/redis/elasticache.rs b/proxy/src/redis/elasticache.rs index d118c8f4128c..bf6dde933285 100644 --- a/proxy/src/redis/elasticache.rs +++ b/proxy/src/redis/elasticache.rs @@ -1,6 +1,14 @@ +use std::sync::Arc; use std::time::{Duration, SystemTime}; +use aws_config::environment::EnvironmentVariableCredentialsProvider; +use aws_config::imds::credentials::ImdsCredentialsProvider; use aws_config::meta::credentials::CredentialsProviderChain; +use aws_config::meta::region::RegionProviderChain; +use aws_config::profile::ProfileFileCredentialsProvider; +use 
aws_config::provider_config::ProviderConfig; +use aws_config::web_identity_token::WebIdentityTokenCredentialsProvider; +use aws_config::Region; use aws_sdk_iam::config::ProvideCredentials; use aws_sigv4::http_request::{ self, SignableBody, SignableRequest, SignatureLocation, SigningSettings, @@ -45,12 +53,45 @@ pub struct CredentialsProvider { } impl CredentialsProvider { - pub fn new(config: AWSIRSAConfig, credentials_provider: CredentialsProviderChain) -> Self { - CredentialsProvider { - config, - credentials_provider, - } + pub async fn new( + aws_region: String, + redis_cluster_name: Option, + redis_user_id: Option, + ) -> Arc { + let region_provider = + RegionProviderChain::default_provider().or_else(Region::new(aws_region.clone())); + let provider_conf = + ProviderConfig::without_region().with_region(region_provider.region().await); + let aws_credentials_provider = { + // uses "AWS_ACCESS_KEY_ID", "AWS_SECRET_ACCESS_KEY" + CredentialsProviderChain::first_try( + "env", + EnvironmentVariableCredentialsProvider::new(), + ) + // uses "AWS_PROFILE" / `aws sso login --profile ` + .or_else( + "profile-sso", + ProfileFileCredentialsProvider::builder() + .configure(&provider_conf) + .build(), + ) + // uses "AWS_WEB_IDENTITY_TOKEN_FILE", "AWS_ROLE_ARN", "AWS_ROLE_SESSION_NAME" + // needed to access remote extensions bucket + .or_else( + "token", + WebIdentityTokenCredentialsProvider::builder() + .configure(&provider_conf) + .build(), + ) + // uses imds v2 + .or_else("imds", ImdsCredentialsProvider::builder().build()) + }; + Arc::new(CredentialsProvider { + config: AWSIRSAConfig::new(aws_region, redis_cluster_name, redis_user_id), + credentials_provider: aws_credentials_provider, + }) } + pub(crate) async fn provide_credentials(&self) -> anyhow::Result<(String, String)> { let aws_credentials = self .credentials_provider From fa909c27fc23be35e889b591a4eae028bd43434d Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Mon, 2 Dec 2024 19:10:44 +0300 Subject: [PATCH 07/65] Update consensus protocol spec (#9607) The spec was written for the buggy protocol which we had before the one more similar to Raft was implemented. Update the spec with what we currently have. 
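For orientation only (not part of the commit): the added model configs are meant to be fed to TLC, and the new `modelcheck.sh` wrapper presumably automates that. A rough, hypothetical invocation is sketched below; the `tla2tools.jar` path and the chosen config file are placeholder assumptions, not taken from this patch.

```sh
# Hypothetical sketch of checking one of the added models with TLC.
# Assumes tla2tools.jar is installed locally; the patch's modelcheck.sh
# likely wraps an equivalent command and writes logs under tlc-results/.
cd safekeeper/spec
java -cp /path/to/tla2tools.jar tlc2.TLC \
  -workers auto \
  -config models/MCProposerAcceptorStatic_p2_a3_t2_l2.cfg \
  MCProposerAcceptorStatic.tla
```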
ref https://github.com/neondatabase/neon/issues/8699 --- safekeeper/spec/.gitignore | 3 + safekeeper/spec/MCProposerAcceptorStatic.tla | 31 + safekeeper/spec/ProposerAcceptorConsensus.cfg | 34 - safekeeper/spec/ProposerAcceptorConsensus.tla | 363 ---- safekeeper/spec/ProposerAcceptorStatic.tla | 449 +++++ safekeeper/spec/modelcheck.sh | 49 + .../MCProposerAcceptorStatic_p2_a3_t2_l2.cfg | 19 + .../MCProposerAcceptorStatic_p2_a3_t3_l2.cfg | 19 + .../MCProposerAcceptorStatic_p2_a3_t3_l3.cfg | 17 + .../MCProposerAcceptorStatic_p2_a3_t4_l4.cfg | 17 + .../MCProposerAcceptorStatic_p2_a5_t2_l2.cfg | 16 + .../MCProposerAcceptorStatic_p2_a5_t3_l3.cfg | 16 + .../MCProposerAcceptorStatic_p2_a5_t4_l3.cfg | 16 + safekeeper/spec/readme.md | 12 + ...c_p2_a3_t2_l2.cfg-2024-11-06--13-44-17.log | 63 + ...c_p2_a3_t3_l2.cfg-2024-11-15--09-09-58.log | 69 + ...c_p2_a3_t3_l3.cfg-2024-11-06--13-03-51.log | 72 + ...c_p2_a3_t4_l4.cfg-2024-11-06--14-20-25.log | 1466 +++++++++++++++++ ...c_p2_a3_t4_l4.cfg-2024-11-06--15-30-45.log | 1374 +++++++++++++++ ...c_p2_a5_t2_l2.cfg-2024-11-06--12-09-32.log | 89 + 20 files changed, 3797 insertions(+), 397 deletions(-) create mode 100644 safekeeper/spec/.gitignore create mode 100644 safekeeper/spec/MCProposerAcceptorStatic.tla delete mode 100644 safekeeper/spec/ProposerAcceptorConsensus.cfg delete mode 100644 safekeeper/spec/ProposerAcceptorConsensus.tla create mode 100644 safekeeper/spec/ProposerAcceptorStatic.tla create mode 100755 safekeeper/spec/modelcheck.sh create mode 100644 safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t2_l2.cfg create mode 100644 safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l2.cfg create mode 100644 safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l3.cfg create mode 100644 safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t4_l4.cfg create mode 100644 safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t2_l2.cfg create mode 100644 safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t3_l3.cfg create mode 100644 safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t4_l3.cfg create mode 100644 safekeeper/spec/readme.md create mode 100644 safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t2_l2.cfg-2024-11-06--13-44-17.log create mode 100644 safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l2.cfg-2024-11-15--09-09-58.log create mode 100644 safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l3.cfg-2024-11-06--13-03-51.log create mode 100644 safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--14-20-25.log create mode 100644 safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--15-30-45.log create mode 100644 safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a5_t2_l2.cfg-2024-11-06--12-09-32.log diff --git a/safekeeper/spec/.gitignore b/safekeeper/spec/.gitignore new file mode 100644 index 000000000000..7233153039a6 --- /dev/null +++ b/safekeeper/spec/.gitignore @@ -0,0 +1,3 @@ +*TTrace* +*.toolbox/ +states/ diff --git a/safekeeper/spec/MCProposerAcceptorStatic.tla b/safekeeper/spec/MCProposerAcceptorStatic.tla new file mode 100644 index 000000000000..be3d99c6976d --- /dev/null +++ b/safekeeper/spec/MCProposerAcceptorStatic.tla @@ -0,0 +1,31 @@ +---- MODULE MCProposerAcceptorStatic ---- +EXTENDS TLC, ProposerAcceptorStatic + +\* Augments the spec with model 
checking constraints. + +\* For model checking. +CONSTANTS + max_entries, \* model constraint: max log entries acceptor/proposer can hold + max_term \* model constraint: max allowed term + +ASSUME max_entries \in Nat /\ max_term \in Nat + +\* Model space constraint. +StateConstraint == \A p \in proposers: + /\ prop_state[p].term <= max_term + /\ Len(prop_state[p].wal) <= max_entries +\* Sets of proposers and acceptors are symmetric because we don't take any +\* actions depending on some concrete proposer/acceptor (like IF p = p1 THEN +\* ...) +ProposerAcceptorSymmetry == Permutations(proposers) \union Permutations(acceptors) + +\* enforce order of the vars in the error trace with ALIAS +\* Note that ALIAS is supported only since version 1.8.0 which is pre-release +\* as of writing this. +Alias == [ + prop_state |-> prop_state, + acc_state |-> acc_state, + committed |-> committed + ] + +==== diff --git a/safekeeper/spec/ProposerAcceptorConsensus.cfg b/safekeeper/spec/ProposerAcceptorConsensus.cfg deleted file mode 100644 index 989c86e47d75..000000000000 --- a/safekeeper/spec/ProposerAcceptorConsensus.cfg +++ /dev/null @@ -1,34 +0,0 @@ -\* MV CONSTANT declarations -CONSTANT NULL = NULL -CONSTANTS -p1 = p1 -p2 = p2 -p3 = p3 -a1 = a1 -a2 = a2 -a3 = a3 -\* MV CONSTANT definitions -CONSTANT -proposers = {p1, p2} -acceptors = {a1, a2, a3} -\* SYMMETRY definition -SYMMETRY perms -\* CONSTANT definitions -CONSTANT -max_term = 3 -CONSTANT -max_entries = 3 -\* INIT definition -INIT -Init -\* NEXT definition -NEXT -Next -\* INVARIANT definition -INVARIANT -TypeOk -ElectionSafety -LogIsMonotonic -LogSafety -CommittedNotOverwritten -CHECK_DEADLOCK FALSE \ No newline at end of file diff --git a/safekeeper/spec/ProposerAcceptorConsensus.tla b/safekeeper/spec/ProposerAcceptorConsensus.tla deleted file mode 100644 index e5f0bb270f08..000000000000 --- a/safekeeper/spec/ProposerAcceptorConsensus.tla +++ /dev/null @@ -1,363 +0,0 @@ ----- MODULE ProposerAcceptorConsensus ---- - -\* Differences from current implementation: -\* - unified not-globally-unique epoch & term (node_id) -\* Simplifications: -\* - instant message delivery -\* - feedback is not modeled separately, commit_lsn is updated directly - -EXTENDS Integers, Sequences, FiniteSets, TLC - -VARIABLES - prop_state, \* prop_state[p] is state of proposer p - acc_state, \* acc_state[a] is state of acceptor a - commit_lsns \* map of acceptor -> commit_lsn - -CONSTANT - acceptors, - proposers, - max_entries, \* model constraint: max log entries acceptor/proposer can hold - max_term \* model constraint: max allowed term - -CONSTANT NULL - -ASSUME max_entries \in Nat /\ max_term \in Nat - -\* For specifying symmetry set in manual cfg file, see -\* https://github.com/tlaplus/tlaplus/issues/404 -perms == Permutations(proposers) \union Permutations(acceptors) - -\******************************************************************************** -\* Helpers -\******************************************************************************** - -Maximum(S) == - (*************************************************************************) - (* If S is a set of numbers, then this define Maximum(S) to be the *) - (* maximum of those numbers, or -1 if S is empty. 
*) - (*************************************************************************) - IF S = {} THEN -1 - ELSE CHOOSE n \in S : \A m \in S : n \geq m - -\* minimum of numbers in the set, error if set is empty -Minimum(S) == - CHOOSE min \in S : \A n \in S : min <= n - -\* Min of two numbers -Min(a, b) == IF a < b THEN a ELSE b - -\* Set of values of function f. XXX is there a such builtin? -FValues(f) == {f[a] : a \in DOMAIN f} - -\* Sort of 0 for functions -EmptyF == [x \in {} |-> 42] -IsEmptyF(f) == DOMAIN f = {} - -\* Next entry proposer p will push to acceptor a or NULL. -NextEntry(p, a) == - IF Len(prop_state[p].wal) >= prop_state[p].next_send_lsn[a] THEN - CHOOSE r \in FValues(prop_state[p].wal) : r.lsn = prop_state[p].next_send_lsn[a] - ELSE - NULL - - -\***************** - -NumAccs == Cardinality(acceptors) - -\* does acc_set form the quorum? -Quorum(acc_set) == Cardinality(acc_set) >= (NumAccs \div 2 + 1) -\* all quorums of acceptors -Quorums == {subset \in SUBSET acceptors: Quorum(subset)} - -\* flush_lsn of acceptor a. -FlushLsn(a) == Len(acc_state[a].wal) - - -\******************************************************************************** -\* Type assertion -\******************************************************************************** -\* Defining sets of all possible tuples and using them in TypeOk in usual -\* all-tuples constructor is not practical because such definitions force -\* TLC to enumerate them, while they are are horribly enormous -\* (TLC screams "Attempted to construct a set with too many elements"). -\* So instead check types manually. -TypeOk == - /\ \A p \in proposers: - /\ DOMAIN prop_state[p] = {"state", "term", "votes", "donor_epoch", "vcl", "wal", "next_send_lsn"} - \* in campaign proposer sends RequestVote and waits for acks; - \* in leader he is elected - /\ prop_state[p].state \in {"campaign", "leader"} - \* 0..max_term should be actually Nat in the unbounded model, but TLC won't - \* swallow it - /\ prop_state[p].term \in 0..max_term - \* votes received - /\ \A voter \in DOMAIN prop_state[p].votes: - /\ voter \in acceptors - /\ prop_state[p].votes[voter] \in [epoch: 0..max_term, flush_lsn: 0..max_entries] - /\ prop_state[p].donor_epoch \in 0..max_term - \* wal is sequence of just records - /\ \A i \in DOMAIN prop_state[p].wal: - prop_state[p].wal[i] \in [lsn: 1..max_entries, epoch: 1..max_term] - \* Following implementation, we skew the original Aurora meaning of this; - \* here it is lsn of highest definitely committed record as set by proposer - \* when it is elected; it doesn't change since then - /\ prop_state[p].vcl \in 0..max_entries - \* map of acceptor -> next lsn to send - /\ \A a \in DOMAIN prop_state[p].next_send_lsn: - /\ a \in acceptors - /\ prop_state[p].next_send_lsn[a] \in 1..(max_entries + 1) - /\ \A a \in acceptors: - /\ DOMAIN acc_state[a] = {"term", "epoch", "wal"} - /\ acc_state[a].term \in 0..max_term - /\ acc_state[a].epoch \in 0..max_term - /\ \A i \in DOMAIN acc_state[a].wal: - acc_state[a].wal[i] \in [lsn: 1..max_entries, epoch: 1..max_term] - /\ \A a \in DOMAIN commit_lsns: - /\ a \in acceptors - /\ commit_lsns[a] \in 0..max_entries - -\******************************************************************************** -\* Initial -\******************************************************************************** - -Init == - /\ prop_state = [p \in proposers |-> [ - state |-> "campaign", - term |-> 1, - votes |-> EmptyF, - donor_epoch |-> 0, - vcl |-> 0, - wal |-> << >>, - next_send_lsn |-> EmptyF - ]] - /\ acc_state = [a \in 
acceptors |-> [ - \* there will be no leader in this term, 1 is the first real - term |-> 0, - epoch |-> 0, - wal |-> << >> - ]] - /\ commit_lsns = [a \in acceptors |-> 0] - - -\******************************************************************************** -\* Actions -\******************************************************************************** - -\* Proposer loses all state. -\* For simplicity (and to reduct state space), we assume it immediately gets -\* current state from quorum q of acceptors determining the term he will request -\* to vote for. -RestartProposer(p, q) == - /\ Quorum(q) - /\ LET - new_term == Maximum({acc_state[a].term : a \in q}) + 1 - IN - /\ new_term <= max_term - /\ prop_state' = [prop_state EXCEPT ![p].state = "campaign", - ![p].term = new_term, - ![p].votes = EmptyF, - ![p].donor_epoch = 0, - ![p].vcl = 0, - ![p].wal = << >>, - ![p].next_send_lsn = EmptyF] - /\ UNCHANGED <> - -\* Acceptor a immediately votes for proposer p. -Vote(p, a) == - /\ prop_state[p].state = "campaign" - /\ acc_state[a].term < prop_state[p].term \* main voting condition - /\ acc_state' = [acc_state EXCEPT ![a].term = prop_state[p].term] - /\ LET - vote == [epoch |-> acc_state[a].epoch, flush_lsn |-> FlushLsn(a)] - IN - prop_state' = [prop_state EXCEPT ![p].votes = prop_state[p].votes @@ (a :> vote)] - /\ UNCHANGED <> - - -\* Proposer p gets elected. -BecomeLeader(p) == - /\ prop_state[p].state = "campaign" - /\ Quorum(DOMAIN prop_state[p].votes) - /\ LET - max_epoch == Maximum({v.epoch : v \in FValues(prop_state[p].votes)}) - max_epoch_votes == {v \in FValues(prop_state[p].votes) : v.epoch = max_epoch} - donor == CHOOSE dv \in DOMAIN prop_state[p].votes : - /\ prop_state[p].votes[dv].epoch = max_epoch - /\ \A v \in max_epoch_votes: - prop_state[p].votes[dv].flush_lsn >= v.flush_lsn - max_vote == prop_state[p].votes[donor] - \* Establish lsn to stream from for voters. - \* At some point it seemed like we can regard log as correct and only - \* append to it if has in the max_epoch, however TLC showed that's not - \* the case; we must always stream since first not matching record. - next_send_lsn == [voter \in DOMAIN prop_state[p].votes |-> 1] - IN - \* we fetch log from the most advanced node (this is separate - \* roundtrip), make sure node is still on one term with us - /\ acc_state[donor].term = prop_state[p].term - /\ prop_state' = [prop_state EXCEPT ![p].state = "leader", - \* fetch the log from donor - ![p].wal = acc_state[donor].wal, - ![p].donor_epoch = max_epoch, - ![p].vcl = max_vote.flush_lsn, - ![p].next_send_lsn = next_send_lsn] - /\ UNCHANGED <> - - -\* acceptor a learns about elected proposer p's term. 
-UpdateTerm(p, a) == - /\ prop_state[p].state = "leader" - /\ acc_state[a].term < prop_state[p].term - /\ acc_state' = [acc_state EXCEPT ![a].term = prop_state[p].term] - /\ UNCHANGED <> - - -\* Acceptor a which didn't participate in voting connects to elected proposer p -\* and p sets the streaming point -HandshakeWithLeader(p, a) == - /\ prop_state[p].state = "leader" - /\ acc_state[a].term = prop_state[p].term - /\ a \notin DOMAIN prop_state[p].next_send_lsn - /\ LET - next_send_lsn == prop_state[p].next_send_lsn @@ (a :> 1) - IN - prop_state' = [prop_state EXCEPT ![p].next_send_lsn = next_send_lsn] - /\ UNCHANGED <> - - -\* Append new log entry to elected proposer -NewEntry(p) == - /\ prop_state[p].state = "leader" - /\ Len(prop_state[p].wal) < max_entries \* model constraint - /\ LET - new_lsn == IF Len(prop_state[p].wal) = 0 THEN - prop_state[p].vcl + 1 - ELSE - \* lsn of last record + 1 - prop_state[p].wal[Len(prop_state[p].wal)].lsn + 1 - new_entry == [lsn |-> new_lsn, epoch |-> prop_state[p].term] - IN - /\ prop_state' = [prop_state EXCEPT ![p].wal = Append(prop_state[p].wal, new_entry)] - /\ UNCHANGED <> - - -\* Write entry new_e to log wal, rolling back all higher entries if e is different. -\* If bump_epoch is TRUE, it means we get record with lsn=vcl and going to update -\* the epoch. Truncate log in this case as well, as we might have correct <= vcl -\* part and some outdated entries behind it which we want to purge before -\* declaring us as recovered. Another way to accomplish this (in previous commit) -\* is wait for first-entry-from-new-epoch before bumping it. -WriteEntry(wal, new_e, bump_epoch) == - (new_e.lsn :> new_e) @@ - \* If wal has entry with such lsn and it is different, truncate all higher log. - IF \/ (new_e.lsn \in DOMAIN wal /\ wal[new_e.lsn] /= new_e) - \/ bump_epoch THEN - SelectSeq(wal, LAMBDA e: e.lsn < new_e.lsn) - ELSE - wal - - -\* Try to transfer entry from elected proposer p to acceptor a -TransferEntry(p, a) == - /\ prop_state[p].state = "leader" - /\ prop_state[p].term = acc_state[a].term - /\ a \in DOMAIN prop_state[p].next_send_lsn - /\ LET - next_e == NextEntry(p, a) - IN - /\ next_e /= NULL - /\ LET - \* Consider bumping epoch if getting this entry recovers the acceptor, - \* that is, we reach first record behind VCL. - new_epoch == - IF /\ acc_state[a].epoch < prop_state[p].term - /\ next_e.lsn >= prop_state[p].vcl - THEN - prop_state[p].term - ELSE - acc_state[a].epoch - \* Also check whether this entry allows to advance commit_lsn and - \* if so, bump it where possible. Modeling this as separate action - \* significantly bloats the space (5m vs 15m on max_entries=3 max_term=3, - \* so act immediately. 
- entry_owners == {o \in acceptors: - /\ o /= a - \* only recovered acceptors advance commit_lsn - /\ acc_state[o].epoch = prop_state[p].term - /\ next_e \in FValues(acc_state[o].wal)} \cup {a} - IN - /\ acc_state' = [acc_state EXCEPT ![a].wal = WriteEntry(acc_state[a].wal, next_e, new_epoch /= acc_state[a].epoch), - ![a].epoch = new_epoch] - /\ prop_state' = [prop_state EXCEPT ![p].next_send_lsn[a] = - prop_state[p].next_send_lsn[a] + 1] - /\ commit_lsns' = IF /\ new_epoch = prop_state[p].term - /\ Quorum(entry_owners) - THEN - [acc \in acceptors |-> - IF /\ acc \in entry_owners - /\ next_e.lsn > commit_lsns[acc] - THEN - next_e.lsn - ELSE - commit_lsns[acc]] - ELSE - commit_lsns - - -\******************************************************************************* -\* Final spec -\******************************************************************************* - -Next == - \/ \E q \in Quorums: \E p \in proposers: RestartProposer(p, q) - \/ \E p \in proposers: \E a \in acceptors: Vote(p, a) - \/ \E p \in proposers: BecomeLeader(p) - \/ \E p \in proposers: \E a \in acceptors: UpdateTerm(p, a) - \/ \E p \in proposers: \E a \in acceptors: HandshakeWithLeader(p, a) - \/ \E p \in proposers: NewEntry(p) - \/ \E p \in proposers: \E a \in acceptors: TransferEntry(p, a) - -Spec == Init /\ [][Next]_<> - - -\******************************************************************************** -\* Invariants -\******************************************************************************** - -\* we don't track history, but this property is fairly convincing anyway -ElectionSafety == - \A p1, p2 \in proposers: - (/\ prop_state[p1].state = "leader" - /\ prop_state[p2].state = "leader" - /\ prop_state[p1].term = prop_state[p2].term) => (p1 = p2) - -LogIsMonotonic == - \A a \in acceptors: - \A i \in DOMAIN acc_state[a].wal: \A j \in DOMAIN acc_state[a].wal: - (i > j) => (/\ acc_state[a].wal[i].lsn > acc_state[a].wal[j].lsn - /\ acc_state[a].wal[i].epoch >= acc_state[a].wal[j].epoch) - -\* Main invariant: log under commit_lsn must match everywhere. -LogSafety == - \A a1 \in acceptors: \A a2 \in acceptors: - LET - common_len == Min(commit_lsns[a1], commit_lsns[a2]) - IN - SubSeq(acc_state[a1].wal, 1, common_len) = SubSeq(acc_state[a2].wal, 1, common_len) - -\* Next record we are going to push to acceptor must never overwrite committed -\* different record. -CommittedNotOverwritten == - \A p \in proposers: \A a \in acceptors: - (/\ prop_state[p].state = "leader" - /\ prop_state[p].term = acc_state[a].term - /\ a \in DOMAIN prop_state[p].next_send_lsn) => - LET - next_e == NextEntry(p, a) - IN - (next_e /= NULL) => - ((commit_lsns[a] >= next_e.lsn) => (acc_state[a].wal[next_e.lsn] = next_e)) - - -==== \ No newline at end of file diff --git a/safekeeper/spec/ProposerAcceptorStatic.tla b/safekeeper/spec/ProposerAcceptorStatic.tla new file mode 100644 index 000000000000..b2d2f005dba8 --- /dev/null +++ b/safekeeper/spec/ProposerAcceptorStatic.tla @@ -0,0 +1,449 @@ +---- MODULE ProposerAcceptorStatic ---- + +(* + The protocol is very similar to Raft. The key differences are: + - Leaders (proposers) are separated from storage nodes (acceptors), which has + been already an established way to think about Paxos. + - We don't want to stamp each log record with term, so instead carry around + term histories which are sequences of pairs. 
+ As a bonus (and subtlety) this allows the proposer to commit entries from + previous terms without writing new records -- if acceptor's log is caught + up, update of term history on it updates last_log_term as well. +*) + +\* Model simplifications: +\* - Instant message delivery. Notably, ProposerElected message (TruncateWal action) is not +\* delayed, so we don't attempt to truncate WAL when the same wp already appended something +\* on the acceptor since common point had been calculated (this should be rejected). +\* - old WAL is immediately copied to proposer on its election, without on-demand fetch later. + +\* Some ideas how to break it to play around to get a feeling: +\* - replace Quorums with BadQuorums. +\* - remove 'don't commit entries from previous terms separately' rule in +\* CommitEntries and observe figure 8 from the raft paper. +\* With p2a3t4l4 32 steps error was found in 1h on 80 cores. + +EXTENDS Integers, Sequences, FiniteSets, TLC + +VARIABLES + prop_state, \* prop_state[p] is state of proposer p + acc_state, \* acc_state[a] is state of acceptor a + committed, \* bag (set) of ever committed <> entries + elected_history \* counter for elected terms, see TypeOk for details + +CONSTANT + acceptors, + proposers + +CONSTANT NULL + +\******************************************************************************** +\* Helpers +\******************************************************************************** + +Maximum(S) == + (*************************************************************************) + (* If S is a set of numbers, then this define Maximum(S) to be the *) + (* maximum of those numbers, or -1 if S is empty. *) + (*************************************************************************) + IF S = {} THEN -1 ELSE CHOOSE n \in S : \A m \in S : n \geq m + +\* minimum of numbers in the set, error if set is empty +Minimum(S) == CHOOSE min \in S : \A n \in S : min <= n + +\* Min of two numbers +Min(a, b) == IF a < b THEN a ELSE b + +\* Sort of 0 for functions +EmptyF == [x \in {} |-> 42] +IsEmptyF(f) == DOMAIN f = {} + +\* Set of values (image) of the function f. Apparently no such builtin. +Range(f) == {f[x] : x \in DOMAIN f} + +\* If key k is in function f, map it using l, otherwise insert v. Returns the +\* updated function. +Upsert(f, k, v, l(_)) == + LET new_val == IF k \in DOMAIN f THEN l(f[k]) ELSE v IN + (k :> new_val) @@ f + +\***************** + +NumAccs == Cardinality(acceptors) + +\* does acc_set form the quorum? +Quorum(acc_set) == Cardinality(acc_set) >= (NumAccs \div 2 + 1) +\* all quorums of acceptors +Quorums == {subset \in SUBSET acceptors: Quorum(subset)} + +\* For substituting Quorums and seeing what happens. +BadQuorum(acc_set) == Cardinality(acc_set) >= (NumAccs \div 2) +BadQuorums == {subset \in SUBSET acceptors: BadQuorum(subset)} + +\* flushLsn (end of WAL, i.e. index of next entry) of acceptor a. +FlushLsn(a) == Len(acc_state[a].wal) + 1 + +\* Typedefs. Note that TLA+ Nat includes zero. +Terms == Nat +Lsns == Nat + +\******************************************************************************** +\* Type assertion +\******************************************************************************** +\* Defining sets of all possible tuples and using them in TypeOk in usual +\* all-tuples constructor is not practical because such definitions force +\* TLC to enumerate them, while they are are horribly enormous +\* (TLC screams "Attempted to construct a set with too many elements"). +\* So instead check types manually. 
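As an illustration of the term-history mechanism described in the module header above, here is a minimal Rust sketch (names and types are invented for this example and are not taken from the safekeeper implementation). It models a term history as a sequence of (term, start LSN) pairs and computes the highest common point of a proposer's and an acceptor's histories, mirroring the FindHighestCommonPoint operator defined further down in this spec.

```rust
#[derive(Clone, Copy, Debug)]
struct ThEntry {
    term: u64,
    lsn: u64, // start LSN of this term on the owning node
}

// End of the term-history entry at `idx`: the start of the next entry, or
// `flush_lsn` for the last one (as in the spec, the acceptor's flush LSN is
// used as the end of the final term on both sides).
fn term_end(th: &[ThEntry], idx: usize, flush_lsn: u64) -> u64 {
    if idx + 1 == th.len() { flush_lsn } else { th[idx + 1].lsn }
}

// Highest common point (LSN of the first divergent record) of a proposer's
// and an acceptor's term histories. Both histories begin with the zero-term
// sentinel entry (term 0, lsn 1), so a common index always exists.
fn find_highest_common_point(prop_th: &[ThEntry], acc_th: &[ThEntry], acc_flush_lsn: u64) -> ThEntry {
    let last_common_idx = (0..prop_th.len().min(acc_th.len()))
        .rev()
        .find(|&i| prop_th[i].term == acc_th[i].term)
        .expect("histories share the zero-term sentinel");
    let lsn = term_end(acc_th, last_common_idx, acc_flush_lsn)
        .min(term_end(prop_th, last_common_idx, acc_flush_lsn));
    ThEntry { term: prop_th[last_common_idx].term, lsn }
}

fn main() {
    // Proposer elected in term 3 on top of term 1; the acceptor wrote an
    // extra entry in term 2 that the proposer never saw.
    let prop = [ThEntry { term: 0, lsn: 1 }, ThEntry { term: 1, lsn: 1 }, ThEntry { term: 3, lsn: 3 }];
    let acc = [ThEntry { term: 0, lsn: 1 }, ThEntry { term: 1, lsn: 1 }, ThEntry { term: 2, lsn: 2 }];
    let hcp = find_highest_common_point(&prop, &acc, 4);
    println!("{hcp:?}"); // ThEntry { term: 1, lsn: 2 }
}
```

In this example the acceptor diverged in term 2, so the first divergent record is at LSN 2: the acceptor's WAL would be truncated back to LSN 1 before streaming resumes, as in the TruncateWal action.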
+ + +\* Term history is a sequence of pairs. +IsTermHistory(th) == + \A th_entry \in Range(th): th_entry.term \in Terms /\ th_entry.lsn \in Lsns + +IsWal(w) == + \A i \in DOMAIN w: + /\ i \in Lsns + /\ w[i] \in Terms + +TypeOk == + /\ \A p \in proposers: + \* '_' in field names hinders pretty printing + \* https://github.com/tlaplus/tlaplus/issues/1051 + \* so use camel case. + /\ DOMAIN prop_state[p] = {"state", "term", "votes", "termHistory", "wal", "nextSendLsn"} + \* In campaign proposer sends RequestVote and waits for acks; + \* in leader he is elected. + /\ prop_state[p].state \in {"campaign", "leader"} + \* term for which it will campaign, or won term in leader state + /\ prop_state[p].term \in Terms + \* votes received + /\ \A voter \in DOMAIN prop_state[p].votes: voter \in acceptors + /\ \A vote \in Range(prop_state[p].votes): + /\ IsTermHistory(vote.termHistory) + /\ vote.flushLsn \in Lsns + \* Proposer's term history. Empty while proposer is in "campaign". + /\ IsTermHistory(prop_state[p].termHistory) + \* In the model we identify WAL entries only by pairs + \* without additional unique id, which is enough for its purposes. + \* It means that with term history fully modeled wal becomes + \* redundant as it can be computed from term history + WAL length. + \* However, we still keep it here and at acceptors as explicit sequence + \* where index is LSN and value is the term to avoid artificial mapping to + \* figure out real entries. It shouldn't bloat model much because this + \* doesn't increase number of distinct states. + /\ IsWal(prop_state[p].wal) + \* Map of acceptor -> next lsn to send. It is set when truncate_wal is + \* done so sending entries is allowed only after that. In the impl TCP + \* ensures this ordering. + /\ \A a \in DOMAIN prop_state[p].nextSendLsn: + /\ a \in acceptors + /\ prop_state[p].nextSendLsn[a] \in Lsns + /\ \A a \in acceptors: + /\ DOMAIN acc_state[a] = {"term", "termHistory", "wal"} + /\ acc_state[a].term \in Terms + /\ IsTermHistory(acc_state[a].termHistory) + /\ IsWal(acc_state[a].wal) + /\ \A c \in committed: + /\ c.term \in Terms + /\ c.lsn \in Lsns + \* elected_history is a retrospective map of term -> number of times it was + \* elected, for use in ElectionSafetyFull invariant. For static spec it is + \* fairly convincing that it holds, but with membership change it is less + \* trivial. And as we identify log entries only with , importance + \* of it is quite high as violation of log safety might go undetected if + \* election safety is violated. Note though that this is not always the + \* case, i.e. you can imagine (and TLC should find) schedule where log + \* safety violation is still detected because two leaders with the same term + \* commit histories which are different in previous terms, so it is not that + \* crucial. Plus if spec allows ElectionSafetyFull violation, likely + \* ElectionSafety will also be violated in some schedules. But neither it + \* should bloat the model too much. 
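As a side note, the elected_history bookkeeping and the ElectionSafetyFull invariant it supports are easy to restate outside TLA+. The following hedged Rust sketch (invented names, not from the codebase) mirrors Upsert(elected_history, term, 1, LAMBDA c: c + 1) and the check that no term is ever won more than once.

```rust
use std::collections::HashMap;

// Record that `term` won an election: insert 1 on first win, increment on
// repeated wins, mirroring the Upsert helper in the spec.
fn record_election(elected_history: &mut HashMap<u64, u32>, term: u64) {
    *elected_history.entry(term).or_insert(0) += 1;
}

// ElectionSafetyFull: no term may ever be elected more than once.
fn election_safety_full(elected_history: &HashMap<u64, u32>) -> bool {
    elected_history.values().all(|&wins| wins <= 1)
}

fn main() {
    let mut hist = HashMap::new();
    record_election(&mut hist, 1);
    record_election(&mut hist, 2);
    assert!(election_safety_full(&hist));
    record_election(&mut hist, 2); // a second win of term 2 breaks the invariant
    assert!(!election_safety_full(&hist));
}
```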
+ /\ \A term \in DOMAIN elected_history: + /\ term \in Terms + /\ elected_history[term] \in Nat + +\******************************************************************************** +\* Initial +\******************************************************************************** + +Init == + /\ prop_state = [p \in proposers |-> [ + state |-> "campaign", + term |-> 1, + votes |-> EmptyF, + termHistory |-> << >>, + wal |-> << >>, + nextSendLsn |-> EmptyF + ]] + /\ acc_state = [a \in acceptors |-> [ + \* There will be no leader in zero term, 1 is the first + \* real. + term |-> 0, + \* Again, leader in term 0 doesn't exist, but we initialize + \* term histories with it to always have common point in + \* them. Lsn is 1 because TLA+ sequences are indexed from 1 + \* (we don't want to truncate WAL out of range). + termHistory |-> << [term |-> 0, lsn |-> 1] >>, + wal |-> << >> + ]] + /\ committed = {} + /\ elected_history = EmptyF + + +\******************************************************************************** +\* Actions +\******************************************************************************** + +\* Proposer loses all state. +\* For simplicity (and to reduct state space), we assume it immediately gets +\* current state from quorum q of acceptors determining the term he will request +\* to vote for. +RestartProposer(p, q) == + /\ Quorum(q) + /\ LET new_term == Maximum({acc_state[a].term : a \in q}) + 1 IN + /\ prop_state' = [prop_state EXCEPT ![p].state = "campaign", + ![p].term = new_term, + ![p].votes = EmptyF, + ![p].termHistory = << >>, + ![p].wal = << >>, + ![p].nextSendLsn = EmptyF] + /\ UNCHANGED <> + +\* Term history of acceptor a's WAL: the one saved truncated to contain only <= +\* local FlushLsn entries. +AcceptorTermHistory(a) == + SelectSeq(acc_state[a].termHistory, LAMBDA th_entry: th_entry.lsn <= FlushLsn(a)) + +\* Acceptor a immediately votes for proposer p. +Vote(p, a) == + /\ prop_state[p].state = "campaign" + /\ acc_state[a].term < prop_state[p].term \* main voting condition + /\ acc_state' = [acc_state EXCEPT ![a].term = prop_state[p].term] + /\ LET + vote == [termHistory |-> AcceptorTermHistory(a), flushLsn |-> FlushLsn(a)] + IN + prop_state' = [prop_state EXCEPT ![p].votes = (a :> vote) @@ prop_state[p].votes] + /\ UNCHANGED <> + + +\* Get lastLogTerm from term history th. +LastLogTerm(th) == th[Len(th)].term + +\* Proposer p gets elected. +BecomeLeader(p) == + /\ prop_state[p].state = "campaign" + /\ Quorum(DOMAIN prop_state[p].votes) + /\ LET + \* Find acceptor with the highest vote. + max_vote_acc == + CHOOSE a \in DOMAIN prop_state[p].votes: + LET v == prop_state[p].votes[a] + IN \A v2 \in Range(prop_state[p].votes): + /\ LastLogTerm(v.termHistory) >= LastLogTerm(v2.termHistory) + /\ (LastLogTerm(v.termHistory) = LastLogTerm(v2.termHistory) => v.flushLsn >= v2.flushLsn) + max_vote == prop_state[p].votes[max_vote_acc] + prop_th == Append(max_vote.termHistory, [term |-> prop_state[p].term, lsn |-> max_vote.flushLsn]) + IN + \* We copy all log preceding proposer's term from the max vote node so + \* make sure it is still on one term with us. This is a model + \* simplification which can be removed, in impl we fetch WAL on demand + \* from safekeeper which has it later. Note though that in case of on + \* demand fetch we must check on donor not only term match, but that + \* truncate_wal had already been done (if it is not max_vote_acc). 
+ /\ acc_state[max_vote_acc].term = prop_state[p].term + /\ prop_state' = [prop_state EXCEPT ![p].state = "leader", + ![p].termHistory = prop_th, + ![p].wal = acc_state[max_vote_acc].wal + ] + /\ elected_history' = Upsert(elected_history, prop_state[p].term, 1, LAMBDA c: c + 1) + /\ UNCHANGED <> + + +\* Acceptor a learns about elected proposer p's term. In impl it matches to +\* VoteRequest/VoteResponse exchange when leader is already elected and is not +\* interested in the vote result. +UpdateTerm(p, a) == + /\ prop_state[p].state = "leader" + /\ acc_state[a].term < prop_state[p].term + /\ acc_state' = [acc_state EXCEPT ![a].term = prop_state[p].term] + /\ UNCHANGED <> + +\* Find highest common point (LSN of the first divergent record) in the logs of +\* proposer p and acceptor a. Returns of the highest common point. +FindHighestCommonPoint(prop_th, acc_th, acc_flush_lsn) == + LET + \* First find index of the highest common term. + \* It must exist because we initialize th with <0, 1>. + last_common_idx == Maximum({i \in 1..Min(Len(prop_th), Len(acc_th)): prop_th[i].term = acc_th[i].term}) + last_common_term == prop_th[last_common_idx].term + \* Now find where it ends at both prop and acc and take min. End of term + \* is the start of the next unless it is the last one; there it is + \* flush_lsn in case of acceptor. In case of proposer it is the current + \* writing position, but it can't be less than flush_lsn, so we + \* take flush_lsn. + acc_common_term_end == IF last_common_idx = Len(acc_th) THEN acc_flush_lsn ELSE acc_th[last_common_idx + 1].lsn + prop_common_term_end == IF last_common_idx = Len(prop_th) THEN acc_flush_lsn ELSE prop_th[last_common_idx + 1].lsn + IN + [term |-> last_common_term, lsn |-> Min(acc_common_term_end, prop_common_term_end)] + +\* Elected proposer p immediately truncates WAL (and term history) of acceptor a +\* before starting streaming. Establishes nextSendLsn for a. +\* +\* In impl this happens at each reconnection, here we also allow to do it multiple times. +TruncateWal(p, a) == + /\ prop_state[p].state = "leader" + /\ acc_state[a].term = prop_state[p].term + /\ LET + hcp == FindHighestCommonPoint(prop_state[p].termHistory, AcceptorTermHistory(a), FlushLsn(a)) + next_send_lsn == (a :> hcp.lsn) @@ prop_state[p].nextSendLsn + IN + \* Acceptor persists full history immediately; reads adjust it to the + \* really existing wal with AcceptorTermHistory. + /\ acc_state' = [acc_state EXCEPT ![a].termHistory = prop_state[p].termHistory, + \* note: SubSeq is inclusive, hence -1. + ![a].wal = SubSeq(acc_state[a].wal, 1, hcp.lsn - 1) + ] + /\ prop_state' = [prop_state EXCEPT ![p].nextSendLsn = next_send_lsn] + /\ UNCHANGED <> + +\* Append new log entry to elected proposer +NewEntry(p) == + /\ prop_state[p].state = "leader" + /\ LET + \* entry consists only of term, index serves as LSN. + new_entry == prop_state[p].term + IN + /\ prop_state' = [prop_state EXCEPT ![p].wal = Append(prop_state[p].wal, new_entry)] + /\ UNCHANGED <> + +\* Immediately append next entry from elected proposer to acceptor a. +AppendEntry(p, a) == + /\ prop_state[p].state = "leader" + /\ acc_state[a].term = prop_state[p].term + /\ a \in DOMAIN prop_state[p].nextSendLsn \* did TruncateWal + /\ prop_state[p].nextSendLsn[a] <= Len(prop_state[p].wal) \* have smth to send + /\ LET + send_lsn == prop_state[p].nextSendLsn[a] + entry == prop_state[p].wal[send_lsn] + \* Since message delivery is instant we don't check that send_lsn follows + \* the last acc record, it must always be true. 
+ IN + /\ prop_state' = [prop_state EXCEPT ![p].nextSendLsn[a] = send_lsn + 1] + /\ acc_state' = [acc_state EXCEPT ![a].wal = Append(acc_state[a].wal, entry)] + /\ UNCHANGED <> + +\* LSN where elected proposer p starts writing its records. +PropStartLsn(p) == + IF prop_state[p].state = "leader" THEN prop_state[p].termHistory[Len(prop_state[p].termHistory)].lsn ELSE NULL + +\* Proposer p commits all entries it can using quorum q. Note that unlike +\* will62794/logless-reconfig this allows to commit entries from previous terms +\* (when conditions for that are met). +CommitEntries(p, q) == + /\ prop_state[p].state = "leader" + /\ \A a \in q: + /\ acc_state[a].term = prop_state[p].term + \* nextSendLsn existence means TruncateWal has happened, it ensures + \* acceptor's WAL (and FlushLsn) are from proper proposer's history. + \* Alternatively we could compare LastLogTerm here, but that's closer to + \* what we do in the impl (we check flushLsn in AppendResponse, but + \* AppendRequest is processed only if HandleElected handling was good). + /\ a \in DOMAIN prop_state[p].nextSendLsn + \* Now find the LSN present on all the quorum. + /\ LET quorum_lsn == Minimum({FlushLsn(a): a \in q}) IN + \* This is the basic Raft rule of not committing entries from previous + \* terms except along with current term entry (commit them only when + \* quorum recovers, i.e. last_log_term on it reaches leader's term). + /\ quorum_lsn >= PropStartLsn(p) + /\ committed' = committed \cup {[term |-> prop_state[p].wal[lsn], lsn |-> lsn]: lsn \in 1..(quorum_lsn - 1)} + /\ UNCHANGED <> + +\******************************************************************************* +\* Final spec +\******************************************************************************* + +Next == + \/ \E q \in Quorums: \E p \in proposers: RestartProposer(p, q) + \/ \E p \in proposers: \E a \in acceptors: Vote(p, a) + \/ \E p \in proposers: BecomeLeader(p) + \/ \E p \in proposers: \E a \in acceptors: UpdateTerm(p, a) + \/ \E p \in proposers: \E a \in acceptors: TruncateWal(p, a) + \/ \E p \in proposers: NewEntry(p) + \/ \E p \in proposers: \E a \in acceptors: AppendEntry(p, a) + \/ \E q \in Quorums: \E p \in proposers: CommitEntries(p, q) + +Spec == Init /\ [][Next]_<> + + +\******************************************************************************** +\* Invariants +\******************************************************************************** + +\* Lighter version of ElectionSafetyFull which doesn't require elected_history. +ElectionSafety == + \A p1, p2 \in proposers: + (/\ prop_state[p1].state = "leader" + /\ prop_state[p2].state = "leader" + /\ prop_state[p1].term = prop_state[p2].term) => (p1 = p2) + +\* Single term must never be elected more than once. +ElectionSafetyFull == \A term \in DOMAIN elected_history: elected_history[term] <= 1 + +\* Log is expected to be monotonic by comparison. This is not true +\* in variants of multi Paxos, but in Raft (and here) it is. +LogIsMonotonic == + \A a \in acceptors: + \A i, j \in DOMAIN acc_state[a].wal: + (i > j) => (acc_state[a].wal[i] >= acc_state[a].wal[j]) + +\* Main invariant: If two entries are committed at the same LSN, they must be +\* the same entry. +LogSafety == + \A c1, c2 \in committed: (c1.lsn = c2.lsn) => (c1 = c2) + + +\******************************************************************************** +\* Invariants which don't need to hold, but useful for playing/debugging. 
+\******************************************************************************** + +\* Limits term of elected proposers +MaxTerm == \A p \in proposers: (prop_state[p].state = "leader" => prop_state[p].term < 2) + +MaxAccWalLen == \A a \in acceptors: Len(acc_state[a].wal) < 2 + +\* Limits max number of committed entries. That way we can check that we'are +\* actually committing something. +MaxCommitLsn == Cardinality(committed) < 2 + +\* How many records with different terms can be removed in single WAL +\* truncation. +MaxTruncatedTerms == + \A p \in proposers: \A a \in acceptors: + (/\ prop_state[p].state = "leader" + /\ prop_state[p].term = acc_state[a].term) => + LET + hcp == FindHighestCommonPoint(prop_state[p].termHistory, AcceptorTermHistory(a), FlushLsn(a)) + truncated_lsns == {lsn \in DOMAIN acc_state[a].wal: lsn >= hcp.lsn} + truncated_records_terms == {acc_state[a].wal[lsn]: lsn \in truncated_lsns} + IN + Cardinality(truncated_records_terms) < 2 + +\* Check that TruncateWal never deletes committed record. +\* It might seem that this should an invariant, but it is not. +\* With 5 nodes, it is legit to truncate record which had been +\* globally committed: e.g. nodes abc can commit record of term 1 in +\* term 3, and after that leader of term 2 can delete such record +\* on d. On 10 cores TLC can find such a trace in ~7 hours. +CommittedNotTruncated == + \A p \in proposers: \A a \in acceptors: + (/\ prop_state[p].state = "leader" + /\ prop_state[p].term = acc_state[a].term) => + LET + hcp == FindHighestCommonPoint(prop_state[p].termHistory, AcceptorTermHistory(a), FlushLsn(a)) + truncated_lsns == {lsn \in DOMAIN acc_state[a].wal: lsn >= hcp.lsn} + truncated_records == {[term |-> acc_state[a].wal[lsn], lsn |-> lsn]: lsn \in truncated_lsns} + IN + \A r \in truncated_records: r \notin committed + +==== diff --git a/safekeeper/spec/modelcheck.sh b/safekeeper/spec/modelcheck.sh new file mode 100755 index 000000000000..21ead7dad860 --- /dev/null +++ b/safekeeper/spec/modelcheck.sh @@ -0,0 +1,49 @@ +#!/bin/bash + +# Usage: ./modelcheck.sh , e.g. +# ./modelcheck.sh models/MCProposerAcceptorStatic_p2_a3_t3_l3.cfg MCProposerAcceptorStatic.tla +CONFIG=$1 +SPEC=$2 + +MEM=7G +TOOLSPATH="/opt/TLA+Toolbox/tla2tools.jar" + +mkdir -p "tlc-results" +CONFIG_FILE=$(basename -- "$CONFIG") +outfilename="$SPEC-${CONFIG_FILE}-$(date --utc +%Y-%m-%d--%H-%M-%S)".log +outfile="tlc-results/$outfilename" +touch $outfile + +# Save some info about the run. +GIT_REV=`git rev-parse --short HEAD` +INFO=`uname -a` + +# First for Linux, second for Mac. +CPUNAMELinux=$(lscpu | grep 'Model name' | cut -f 2 -d ":" | awk '{$1=$1}1') +CPUCORESLinux=`nproc` +CPUNAMEMac=`sysctl -n machdep.cpu.brand_string` +CPUCORESMac=`sysctl -n machdep.cpu.thread_count` + +echo "git revision: $GIT_REV" >> $outfile +echo "Platform: $INFO" >> $outfile +echo "CPU Info Linux: $CPUNAMELinux" >> $outfile +echo "CPU Cores Linux: $CPUCORESLinux" >> $outfile +echo "CPU Info Mac: $CPUNAMEMac" >> $outfile +echo "CPU Cores Mac: $CPUCORESMac" >> $outfile +echo "Spec: $SPEC" >> $outfile +echo "Config: $CONFIG" >> $outfile +echo "----" >> $outfile +cat $CONFIG >> $outfile +echo "" >> $outfile +echo "----" >> $outfile +echo "" >> $outfile + +# see +# https://lamport.azurewebsites.net/tla/current-tools.pdf +# for TLC options. +# OffHeapDiskFPSet is the optimal fingerprint set implementation +# https://docs.tlapl.us/codebase:architecture#fingerprint_sets_fpsets +# +# Add -simulate to run in infinite simulation mode. 
+java -Xmx$MEM -XX:MaxDirectMemorySize=$MEM -XX:+UseParallelGC -Dtlc2.tool.fp.FPSet.impl=tlc2.tool.fp.OffHeapDiskFPSet \ + -cp "${TOOLSPATH}" tlc2.TLC $SPEC -config $CONFIG -workers auto -gzip | tee -a $outfile diff --git a/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t2_l2.cfg b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t2_l2.cfg new file mode 100644 index 000000000000..c06109c60110 --- /dev/null +++ b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t2_l2.cfg @@ -0,0 +1,19 @@ +\* A very small model just to play. +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 2 +max_entries = 2 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafetyFull +LogIsMonotonic +LogSafety +CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + diff --git a/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l2.cfg b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l2.cfg new file mode 100644 index 000000000000..5d10fa960f06 --- /dev/null +++ b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l2.cfg @@ -0,0 +1,19 @@ +\* A model next to the smallest one. +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 3 +max_entries = 2 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafetyFull +LogIsMonotonic +LogSafety +CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + diff --git a/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l3.cfg b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l3.cfg new file mode 100644 index 000000000000..8ba8ce95a410 --- /dev/null +++ b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t3_l3.cfg @@ -0,0 +1,17 @@ +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 3 +max_entries = 3 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias diff --git a/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t4_l4.cfg b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t4_l4.cfg new file mode 100644 index 000000000000..4763a34ec410 --- /dev/null +++ b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a3_t4_l4.cfg @@ -0,0 +1,17 @@ +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 4 +max_entries = 4 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias diff --git a/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t2_l2.cfg b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t2_l2.cfg new file mode 100644 index 000000000000..ebf4724633fd --- /dev/null +++ b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t2_l2.cfg @@ -0,0 +1,16 @@ +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3, a4, a5} +max_term = 2 +max_entries = 2 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias diff --git a/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t3_l3.cfg b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t3_l3.cfg new file mode 100644 index 000000000000..bb77350c58a1 --- /dev/null +++ 
b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t3_l3.cfg @@ -0,0 +1,16 @@ +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3, a4, a5} +max_term = 3 +max_entries = 3 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias diff --git a/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t4_l3.cfg b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t4_l3.cfg new file mode 100644 index 000000000000..9a5e142f9925 --- /dev/null +++ b/safekeeper/spec/models/MCProposerAcceptorStatic_p2_a5_t4_l3.cfg @@ -0,0 +1,16 @@ +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3, a4, a5} +max_term = 4 +max_entries = 3 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias diff --git a/safekeeper/spec/readme.md b/safekeeper/spec/readme.md new file mode 100644 index 000000000000..ec2649d87da5 --- /dev/null +++ b/safekeeper/spec/readme.md @@ -0,0 +1,12 @@ +The specifications, models and results of running of them of the compute <-> +safekeepers consensus algorithm for committing WAL on the fleet of safekeepers. +Following Paxos parlance, compute which writes WAL is called (WAL) proposer here +and safekeepers which persist it are called (WAL) acceptors. + +Directory structure: +- Use modelcheck.sh to run TLC. +- MC*.tla contains bits of TLA+ needed for TLC like constraining the state space, and models/ actual models. +- Other .tla files are the actual specs. + +Structure is partially borrowed from +[logless-reconfig](https://github.com/will62794/logless-reconfig), thanks to it. diff --git a/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t2_l2.cfg-2024-11-06--13-44-17.log b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t2_l2.cfg-2024-11-06--13-44-17.log new file mode 100644 index 000000000000..768722b1eb41 --- /dev/null +++ b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t2_l2.cfg-2024-11-06--13-44-17.log @@ -0,0 +1,63 @@ +git revision: 864f4667d +Platform: Linux neon-dev-arm64-1 6.8.0-48-generic #48-Ubuntu SMP PREEMPT_DYNAMIC Fri Sep 27 14:35:45 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux +CPU Info Linux: Neoverse-N1 +CPU Cores Linux: 80 +CPU Info Mac: +CPU Cores Mac: +Spec: MCProposerAcceptorStatic.tla +Config: models/MCProposerAcceptorStatic_p2_a3_t2_l2.cfg +---- +\* A very small model just to play. +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 2 +max_entries = 2 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + + +---- + +TLC2 Version 2.20 of Day Month 20?? (rev: f68cb71) +Running breadth-first search Model-Checking with fp 110 and seed 3949669318051689745 with 80 workers on 80 cores with 54613MB heap and 61440MB offheap memory [pid: 46037] (Linux 6.8.0-48-generic aarch64, Ubuntu 21.0.4 x86_64, OffHeapDiskFPSet, DiskStateQueue). 
+Parsing file /home/arseny/neon/safekeeper/spec/MCProposerAcceptorStatic.tla +Parsing file /tmp/tlc-11123278435718411444/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla) +Parsing file /home/arseny/neon/safekeeper/spec/ProposerAcceptorStatic.tla +Parsing file /tmp/tlc-11123278435718411444/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla) +Parsing file /tmp/tlc-11123278435718411444/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla) +Parsing file /tmp/tlc-11123278435718411444/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla) +Parsing file /tmp/tlc-11123278435718411444/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla) +Parsing file /tmp/tlc-11123278435718411444/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla) +Parsing file /tmp/tlc-11123278435718411444/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla) +Semantic processing of module Naturals +Semantic processing of module Sequences +Semantic processing of module FiniteSets +Semantic processing of module TLC +Semantic processing of module Integers +Semantic processing of module ProposerAcceptorStatic +Semantic processing of module TLCExt +Semantic processing of module _TLCTrace +Semantic processing of module MCProposerAcceptorStatic +Starting... (2024-11-06 13:44:18) +Computing initial states... +Finished computing initial states: 1 distinct state generated at 2024-11-06 13:44:20. +Model checking completed. No error has been found. + Estimates of the probability that TLC did not check all reachable states + because two distinct states had the same fingerprint: + calculated (optimistic): val = 2.9E-9 + based on the actual fingerprints: val = 4.1E-10 +922134 states generated, 61249 distinct states found, 0 states left on queue. +The depth of the complete state graph search is 31. +The average outdegree of the complete state graph is 1 (minimum is 0, the maximum 6 and the 95th percentile is 3). +Finished in 11s at (2024-11-06 13:44:28) diff --git a/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l2.cfg-2024-11-15--09-09-58.log b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l2.cfg-2024-11-15--09-09-58.log new file mode 100644 index 000000000000..ae3ba98da61e --- /dev/null +++ b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l2.cfg-2024-11-15--09-09-58.log @@ -0,0 +1,69 @@ +git revision: bcbff084a +Platform: Linux nonlibrem 6.10.11-amd64 #1 SMP PREEMPT_DYNAMIC Debian 6.10.11-1 (2024-09-22) x86_64 GNU/Linux +CPU Info Linux: 13th Gen Intel(R) Core(TM) i7-1355U +CPU Cores Linux: 10 +CPU Info Mac: +CPU Cores Mac: +Spec: MCProposerAcceptorStatic.tla +Config: models/MCProposerAcceptorStatic_p2_a3_t3_l2.cfg +---- +\* A model next to the smallest one. +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 3 +max_entries = 2 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + + +---- + +TLC2 Version 2.20 of Day Month 20?? 
(rev: cc65eef) +Running breadth-first search Model-Checking with fp 41 and seed -3061068726727581619 with 10 workers on 10 cores with 6372MB heap and 7168MB offheap memory [pid: 1250346] (Linux 6.10.11-amd64 amd64, Debian 21.0.5 x86_64, OffHeapDiskFPSet, DiskStateQueue). +Parsing file /home/ars/neon/neon/safekeeper/spec/MCProposerAcceptorStatic.tla +Parsing file /tmp/tlc-3023124431504466774/TLC.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/TLC.tla) +Parsing file /home/ars/neon/neon/safekeeper/spec/ProposerAcceptorStatic.tla +Parsing file /tmp/tlc-3023124431504466774/_TLCTrace.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla) +Parsing file /tmp/tlc-3023124431504466774/Integers.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/Integers.tla) +Parsing file /tmp/tlc-3023124431504466774/Sequences.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla) +Parsing file /tmp/tlc-3023124431504466774/FiniteSets.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla) +Parsing file /tmp/tlc-3023124431504466774/Naturals.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla) +Parsing file /tmp/tlc-3023124431504466774/TLCExt.tla (jar:file:/opt/TLA+Toolbox/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla) +Semantic processing of module Naturals +Semantic processing of module Sequences +Semantic processing of module FiniteSets +Semantic processing of module TLC +Semantic processing of module Integers +Semantic processing of module ProposerAcceptorStatic +Semantic processing of module TLCExt +Semantic processing of module _TLCTrace +Semantic processing of module MCProposerAcceptorStatic +Starting... (2024-11-15 12:09:59) +Computing initial states... +Finished computing initial states: 1 distinct state generated at 2024-11-15 12:10:00. +Progress(19) at 2024-11-15 12:10:03: 464,696 states generated (464,696 s/min), 57,859 distinct states found (57,859 ds/min), 21,435 states left on queue. +Progress(26) at 2024-11-15 12:11:03: 8,813,399 states generated (8,348,703 s/min), 877,254 distinct states found (819,395 ds/min), 214,794 states left on queue. +Progress(27) at 2024-11-15 12:12:03: 16,121,858 states generated (7,308,459 s/min), 1,464,707 distinct states found (587,453 ds/min), 274,230 states left on queue. +Progress(29) at 2024-11-15 12:13:03: 23,073,903 states generated (6,952,045 s/min), 1,948,802 distinct states found (484,095 ds/min), 263,697 states left on queue. +Progress(31) at 2024-11-15 12:14:03: 29,740,681 states generated (6,666,778 s/min), 2,331,052 distinct states found (382,250 ds/min), 185,484 states left on queue. +Progress(34) at 2024-11-15 12:15:03: 36,085,876 states generated (6,345,195 s/min), 2,602,370 distinct states found (271,318 ds/min), 31,659 states left on queue. +Model checking completed. No error has been found. + Estimates of the probability that TLC did not check all reachable states + because two distinct states had the same fingerprint: + calculated (optimistic): val = 4.9E-6 + based on the actual fingerprints: val = 6.9E-7 +36896322 states generated, 2623542 distinct states found, 0 states left on queue. +The depth of the complete state graph search is 39. +The average outdegree of the complete state graph is 1 (minimum is 0, the maximum 7 and the 95th percentile is 3). 
+Finished in 05min 14s at (2024-11-15 12:15:13) diff --git a/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l3.cfg-2024-11-06--13-03-51.log b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l3.cfg-2024-11-06--13-03-51.log new file mode 100644 index 000000000000..46f21cee72ce --- /dev/null +++ b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t3_l3.cfg-2024-11-06--13-03-51.log @@ -0,0 +1,72 @@ +git revision: 864f4667d +Platform: Linux neon-dev-arm64-1 6.8.0-48-generic #48-Ubuntu SMP PREEMPT_DYNAMIC Fri Sep 27 14:35:45 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux +CPU Info Linux: Neoverse-N1 +CPU Cores Linux: 80 +CPU Info Mac: +CPU Cores Mac: +Spec: MCProposerAcceptorStatic.tla +Config: models/MCProposerAcceptorStatic_p2_a3_t3_l3.cfg +---- +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 3 +max_entries = 3 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +CommittedNotTruncated +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + +---- + +TLC2 Version 2.20 of Day Month 20?? (rev: f68cb71) +Running breadth-first search Model-Checking with fp 126 and seed 2302892334567572769 with 80 workers on 80 cores with 54613MB heap and 61440MB offheap memory [pid: 39701] (Linux 6.8.0-48-generic aarch64, Ubuntu 21.0.4 x86_64, OffHeapDiskFPSet, DiskStateQueue). +Parsing file /home/arseny/neon/safekeeper/spec/MCProposerAcceptorStatic.tla +Parsing file /tmp/tlc-15178810317173795942/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla) +Parsing file /home/arseny/neon/safekeeper/spec/ProposerAcceptorStatic.tla +Parsing file /tmp/tlc-15178810317173795942/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla) +Parsing file /tmp/tlc-15178810317173795942/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla) +Parsing file /tmp/tlc-15178810317173795942/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla) +Parsing file /tmp/tlc-15178810317173795942/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla) +Parsing file /tmp/tlc-15178810317173795942/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla) +Parsing file /tmp/tlc-15178810317173795942/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla) +Semantic processing of module Naturals +Semantic processing of module Sequences +Semantic processing of module FiniteSets +Semantic processing of module TLC +Semantic processing of module Integers +Semantic processing of module ProposerAcceptorStatic +Semantic processing of module TLCExt +Semantic processing of module _TLCTrace +Semantic processing of module MCProposerAcceptorStatic +Starting... (2024-11-06 13:03:52) +Computing initial states... +Finished computing initial states: 1 distinct state generated at 2024-11-06 13:03:55. +Progress(21) at 2024-11-06 13:03:58: 846,240 states generated (846,240 s/min), 106,298 distinct states found (106,298 ds/min), 41,028 states left on queue. +Progress(28) at 2024-11-06 13:04:58: 27,538,211 states generated (26,691,971 s/min), 2,768,793 distinct states found (2,662,495 ds/min), 782,984 states left on queue. 
+Progress(30) at 2024-11-06 13:05:58: 54,048,763 states generated (26,510,552 s/min), 5,076,745 distinct states found (2,307,952 ds/min), 1,241,301 states left on queue. +Progress(31) at 2024-11-06 13:06:58: 80,554,724 states generated (26,505,961 s/min), 7,199,201 distinct states found (2,122,456 ds/min), 1,541,574 states left on queue. +Progress(32) at 2024-11-06 13:07:58: 106,991,261 states generated (26,436,537 s/min), 9,121,549 distinct states found (1,922,348 ds/min), 1,686,289 states left on queue. +Progress(33) at 2024-11-06 13:08:58: 133,354,665 states generated (26,363,404 s/min), 10,935,451 distinct states found (1,813,902 ds/min), 1,739,977 states left on queue. +Progress(34) at 2024-11-06 13:09:58: 159,631,385 states generated (26,276,720 s/min), 12,605,372 distinct states found (1,669,921 ds/min), 1,677,447 states left on queue. +Progress(35) at 2024-11-06 13:10:58: 185,862,196 states generated (26,230,811 s/min), 14,138,409 distinct states found (1,533,037 ds/min), 1,501,760 states left on queue. +Progress(36) at 2024-11-06 13:11:58: 212,021,688 states generated (26,159,492 s/min), 15,538,990 distinct states found (1,400,581 ds/min), 1,216,621 states left on queue. +Progress(37) at 2024-11-06 13:12:58: 238,046,160 states generated (26,024,472 s/min), 16,778,583 distinct states found (1,239,593 ds/min), 797,230 states left on queue. +Progress(39) at 2024-11-06 13:13:58: 263,931,163 states generated (25,885,003 s/min), 17,820,786 distinct states found (1,042,203 ds/min), 209,400 states left on queue. +Model checking completed. No error has been found. + Estimates of the probability that TLC did not check all reachable states + because two distinct states had the same fingerprint: + calculated (optimistic): val = 2.5E-4 + based on the actual fingerprints: val = 7.9E-5 +270257170 states generated, 18005639 distinct states found, 0 states left on queue. +The depth of the complete state graph search is 47. +The average outdegree of the complete state graph is 1 (minimum is 0, the maximum 7 and the 95th percentile is 3). +Finished in 10min 25s at (2024-11-06 13:14:17) diff --git a/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--14-20-25.log b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--14-20-25.log new file mode 100644 index 000000000000..c7cc853af0e0 --- /dev/null +++ b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--14-20-25.log @@ -0,0 +1,1466 @@ +# Shows LogSafety violation when "don't commit separately entries from previous terms" check is disabled. +git revision: 4f1ee6331 +Platform: Linux neon-dev-arm64-1 6.8.0-48-generic #48-Ubuntu SMP PREEMPT_DYNAMIC Fri Sep 27 14:35:45 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux +CPU Info Linux: Neoverse-N1 +CPU Cores Linux: 80 +CPU Info Mac: +CPU Cores Mac: +Spec: MCProposerAcceptorStatic.tla +Config: models/MCProposerAcceptorStatic_p2_a3_t4_l4.cfg +---- +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 4 +max_entries = 4 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + +---- + +TLC2 Version 2.20 of Day Month 20?? 
(rev: f68cb71) +Running breadth-first search Model-Checking with fp 12 and seed -5379034126224420237 with 80 workers on 80 cores with 54613MB heap and 61440MB offheap memory [pid: 52295] (Linux 6.8.0-48-generic aarch64, Ubuntu 21.0.4 x86_64, OffHeapDiskFPSet, DiskStateQueue). +Parsing file /home/arseny/neon/safekeeper/spec/MCProposerAcceptorStatic.tla +Parsing file /tmp/tlc-4533438058229992850/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla) +Parsing file /home/arseny/neon/safekeeper/spec/ProposerAcceptorStatic.tla +Parsing file /tmp/tlc-4533438058229992850/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla) +Parsing file /tmp/tlc-4533438058229992850/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla) +Parsing file /tmp/tlc-4533438058229992850/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla) +Parsing file /tmp/tlc-4533438058229992850/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla) +Parsing file /tmp/tlc-4533438058229992850/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla) +Parsing file /tmp/tlc-4533438058229992850/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla) +Semantic processing of module Naturals +Semantic processing of module Sequences +Semantic processing of module FiniteSets +Semantic processing of module TLC +Semantic processing of module Integers +Semantic processing of module ProposerAcceptorStatic +Semantic processing of module TLCExt +Semantic processing of module _TLCTrace +Semantic processing of module MCProposerAcceptorStatic +Starting... (2024-11-06 14:20:26) +Computing initial states... +Finished computing initial states: 1 distinct state generated at 2024-11-06 14:20:29. +Progress(20) at 2024-11-06 14:20:32: 1,011,898 states generated (1,011,898 s/min), 140,947 distinct states found (140,947 ds/min), 60,535 states left on queue. +Progress(26) at 2024-11-06 14:21:32: 30,146,518 states generated (29,134,620 s/min), 3,742,736 distinct states found (3,601,789 ds/min), 1,438,779 states left on queue. +Progress(27) at 2024-11-06 14:22:32: 59,362,708 states generated (29,216,190 s/min), 7,210,233 distinct states found (3,467,497 ds/min), 2,708,295 states left on queue. +Progress(28) at 2024-11-06 14:23:32: 88,589,291 states generated (29,226,583 s/min), 10,552,781 distinct states found (3,342,548 ds/min), 3,874,296 states left on queue. +Progress(29) at 2024-11-06 14:24:32: 117,894,209 states generated (29,304,918 s/min), 13,932,498 distinct states found (3,379,717 ds/min), 5,069,960 states left on queue. +Progress(29) at 2024-11-06 14:25:32: 147,338,882 states generated (29,444,673 s/min), 17,180,069 distinct states found (3,247,571 ds/min), 6,146,371 states left on queue. +Progress(29) at 2024-11-06 14:26:32: 176,498,135 states generated (29,159,253 s/min), 20,547,926 distinct states found (3,367,857 ds/min), 7,338,835 states left on queue. +Progress(30) at 2024-11-06 14:27:32: 205,957,044 states generated (29,458,909 s/min), 23,661,090 distinct states found (3,113,164 ds/min), 8,293,570 states left on queue. +Progress(30) at 2024-11-06 14:28:32: 235,390,133 states generated (29,433,089 s/min), 26,892,306 distinct states found (3,231,216 ds/min), 9,369,229 states left on queue. 
+Progress(30) at 2024-11-06 14:29:32: 264,571,938 states generated (29,181,805 s/min), 30,176,971 distinct states found (3,284,665 ds/min), 10,493,429 states left on queue. +Progress(31) at 2024-11-06 14:30:32: 293,928,191 states generated (29,356,253 s/min), 33,296,160 distinct states found (3,119,189 ds/min), 11,463,686 states left on queue. +Progress(31) at 2024-11-06 14:31:32: 323,436,668 states generated (29,508,477 s/min), 36,347,973 distinct states found (3,051,813 ds/min), 12,365,578 states left on queue. +Progress(31) at 2024-11-06 14:32:32: 352,943,790 states generated (29,507,122 s/min), 39,465,244 distinct states found (3,117,271 ds/min), 13,349,544 states left on queue. +Progress(31) at 2024-11-06 14:33:32: 382,292,863 states generated (29,349,073 s/min), 42,654,621 distinct states found (3,189,377 ds/min), 14,384,363 states left on queue. +Progress(31) at 2024-11-06 14:34:32: 411,385,854 states generated (29,092,991 s/min), 45,941,145 distinct states found (3,286,524 ds/min), 15,509,450 states left on queue. +Progress(31) at 2024-11-06 14:35:32: 440,738,756 states generated (29,352,902 s/min), 48,984,566 distinct states found (3,043,421 ds/min), 16,419,882 states left on queue. +Progress(32) at 2024-11-06 14:36:32: 470,251,558 states generated (29,512,802 s/min), 51,925,693 distinct states found (2,941,127 ds/min), 17,211,457 states left on queue. +Progress(32) at 2024-11-06 14:37:32: 499,714,013 states generated (29,462,455 s/min), 54,955,581 distinct states found (3,029,888 ds/min), 18,114,624 states left on queue. +Progress(32) at 2024-11-06 14:38:32: 529,254,608 states generated (29,540,595 s/min), 57,938,914 distinct states found (2,983,333 ds/min), 18,996,128 states left on queue. +Progress(32) at 2024-11-06 14:39:32: 558,774,398 states generated (29,519,790 s/min), 61,072,943 distinct states found (3,134,029 ds/min), 19,975,689 states left on queue. +Progress(32) at 2024-11-06 14:40:32: 588,134,665 states generated (29,360,267 s/min), 64,148,888 distinct states found (3,075,945 ds/min), 20,922,407 states left on queue. +Progress(32) at 2024-11-06 14:41:32: 617,464,374 states generated (29,329,709 s/min), 67,306,855 distinct states found (3,157,967 ds/min), 21,928,799 states left on queue. +Progress(32) at 2024-11-06 14:42:32: 646,525,281 states generated (29,060,907 s/min), 70,425,194 distinct states found (3,118,339 ds/min), 22,895,971 states left on queue. +Progress(32) at 2024-11-06 14:43:32: 676,054,893 states generated (29,529,612 s/min), 73,351,905 distinct states found (2,926,711 ds/min), 23,703,779 states left on queue. +Progress(33) at 2024-11-06 14:44:32: 705,581,782 states generated (29,526,889 s/min), 76,200,615 distinct states found (2,848,710 ds/min), 24,414,094 states left on queue. +Progress(33) at 2024-11-06 14:45:32: 735,069,836 states generated (29,488,054 s/min), 79,168,244 distinct states found (2,967,629 ds/min), 25,255,224 states left on queue. +Progress(33) at 2024-11-06 14:46:32: 764,659,188 states generated (29,589,352 s/min), 82,024,430 distinct states found (2,856,186 ds/min), 26,011,047 states left on queue. +Progress(33) at 2024-11-06 14:47:32: 794,276,423 states generated (29,617,235 s/min), 84,974,312 distinct states found (2,949,882 ds/min), 26,868,750 states left on queue. +Progress(33) at 2024-11-06 14:48:32: 823,875,831 states generated (29,599,408 s/min), 88,004,386 distinct states found (3,030,074 ds/min), 27,771,984 states left on queue. 
+Progress(33) at 2024-11-06 14:49:32: 853,138,894 states generated (29,263,063 s/min), 91,006,890 distinct states found (3,002,504 ds/min), 28,636,661 states left on queue. +Checkpointing of run states/24-11-06-14-20-25.868 +Checkpointing completed at (2024-11-06 14:50:32) +Progress(33) at 2024-11-06 14:50:32: 882,514,167 states generated (29,375,273 s/min), 94,011,000 distinct states found (3,004,110 ds/min), 29,534,516 states left on queue. +Progress(33) at 2024-11-06 14:51:32: 911,838,377 states generated (29,324,210 s/min), 97,108,937 distinct states found (3,097,937 ds/min), 30,498,587 states left on queue. +Progress(33) at 2024-11-06 14:52:32: 940,646,920 states generated (28,808,543 s/min), 100,248,865 distinct states found (3,139,928 ds/min), 31,472,191 states left on queue. +Progress(33) at 2024-11-06 14:53:32: 970,074,175 states generated (29,427,255 s/min), 103,170,815 distinct states found (2,921,950 ds/min), 32,265,691 states left on queue. +Progress(33) at 2024-11-06 14:54:32: 999,627,974 states generated (29,553,799 s/min), 106,004,823 distinct states found (2,834,008 ds/min), 33,009,618 states left on queue. +Progress(34) at 2024-11-06 14:55:32: 1,029,148,983 states generated (29,521,009 s/min), 108,740,783 distinct states found (2,735,960 ds/min), 33,616,222 states left on queue. +Progress(34) at 2024-11-06 14:56:32: 1,058,582,001 states generated (29,433,018 s/min), 111,612,965 distinct states found (2,872,182 ds/min), 34,375,212 states left on queue. +Progress(34) at 2024-11-06 14:57:32: 1,088,123,602 states generated (29,541,601 s/min), 114,464,196 distinct states found (2,851,231 ds/min), 35,116,195 states left on queue. +Progress(34) at 2024-11-06 14:58:32: 1,117,684,936 states generated (29,561,334 s/min), 117,252,198 distinct states found (2,788,002 ds/min), 35,817,205 states left on queue. +Progress(34) at 2024-11-06 14:59:32: 1,147,356,249 states generated (29,671,313 s/min), 120,014,476 distinct states found (2,762,278 ds/min), 36,517,255 states left on queue. +Progress(34) at 2024-11-06 15:00:32: 1,176,921,098 states generated (29,564,849 s/min), 122,859,312 distinct states found (2,844,836 ds/min), 37,291,096 states left on queue. +Progress(34) at 2024-11-06 15:01:32: 1,206,454,440 states generated (29,533,342 s/min), 125,830,942 distinct states found (2,971,630 ds/min), 38,147,762 states left on queue. +Progress(34) at 2024-11-06 15:02:32: 1,235,721,673 states generated (29,267,233 s/min), 128,869,493 distinct states found (3,038,551 ds/min), 39,035,481 states left on queue. +Progress(34) at 2024-11-06 15:03:32: 1,265,097,779 states generated (29,376,106 s/min), 131,669,552 distinct states found (2,800,059 ds/min), 39,746,864 states left on queue. +Progress(34) at 2024-11-06 15:04:32: 1,294,408,098 states generated (29,310,319 s/min), 134,604,630 distinct states found (2,935,078 ds/min), 40,584,235 states left on queue. +Progress(34) at 2024-11-06 15:05:32: 1,323,792,755 states generated (29,384,657 s/min), 137,579,390 distinct states found (2,974,760 ds/min), 41,446,478 states left on queue. +Progress(34) at 2024-11-06 15:06:32: 1,353,085,163 states generated (29,292,408 s/min), 140,575,723 distinct states found (2,996,333 ds/min), 42,309,510 states left on queue. +Progress(34) at 2024-11-06 15:07:32: 1,381,809,417 states generated (28,724,254 s/min), 143,655,566 distinct states found (3,079,843 ds/min), 43,220,682 states left on queue. 
+Progress(34) at 2024-11-06 15:08:32: 1,411,255,848 states generated (29,446,431 s/min), 146,482,192 distinct states found (2,826,626 ds/min), 43,944,938 states left on queue. +Progress(34) at 2024-11-06 15:09:32: 1,440,646,323 states generated (29,390,475 s/min), 149,419,989 distinct states found (2,937,797 ds/min), 44,763,293 states left on queue. +Progress(34) at 2024-11-06 15:10:32: 1,470,298,568 states generated (29,652,245 s/min), 152,041,419 distinct states found (2,621,430 ds/min), 45,311,911 states left on queue. +Progress(35) at 2024-11-06 15:11:32: 1,499,747,712 states generated (29,449,144 s/min), 154,696,867 distinct states found (2,655,448 ds/min), 45,842,895 states left on queue. +Progress(35) at 2024-11-06 15:12:32: 1,529,256,993 states generated (29,509,281 s/min), 157,493,365 distinct states found (2,796,498 ds/min), 46,535,472 states left on queue. +Progress(35) at 2024-11-06 15:13:32: 1,558,829,306 states generated (29,572,313 s/min), 160,256,575 distinct states found (2,763,210 ds/min), 47,212,471 states left on queue. +Progress(35) at 2024-11-06 15:14:32: 1,588,345,878 states generated (29,516,572 s/min), 163,002,602 distinct states found (2,746,027 ds/min), 47,862,117 states left on queue. +Progress(35) at 2024-11-06 15:15:32: 1,617,885,675 states generated (29,539,797 s/min), 165,699,121 distinct states found (2,696,519 ds/min), 48,472,896 states left on queue. +Progress(35) at 2024-11-06 15:16:32: 1,647,559,965 states generated (29,674,290 s/min), 168,343,286 distinct states found (2,644,165 ds/min), 49,065,377 states left on queue. +Progress(35) at 2024-11-06 15:17:32: 1,677,033,250 states generated (29,473,285 s/min), 171,134,409 distinct states found (2,791,123 ds/min), 49,823,330 states left on queue. +Progress(35) at 2024-11-06 15:18:32: 1,706,730,266 states generated (29,697,016 s/min), 173,860,974 distinct states found (2,726,565 ds/min), 50,493,221 states left on queue. +Error: Invariant LogSafety is violated. 
+Error: The behavior up to this point is: +State 1: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "campaign", + votes |-> <<>>, + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "campaign", + votes |-> <<>>, + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a2 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a3 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 2: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "campaign", + votes |-> <<>>, + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a2 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a3 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 3: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 2, + wal |-> <<>>, + state |-> "campaign", + votes |-> <<>>, + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a2 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a3 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 4: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 2, + wal |-> <<>>, + state |-> "campaign", + votes |-> <<>>, + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a2 :> + [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a3 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 5: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 2, + wal |-> <<>>, + state |-> "campaign", + votes |-> <<>>, + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a2 :> + [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a3 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 6: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> 
"leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 2, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a2 :> + [term |-> 1, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a3 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 7: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 1) ] @@ + p2 :> + [ term |-> 2, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a2 :> + [ term |-> 1, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 0, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 8: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 1) ] @@ + p2 :> + [ term |-> 2, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a2 :> + [ term |-> 1, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 9: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 1) ] @@ + p2 :> + [ term |-> 2, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] @@ + a2 :> + [ term |-> 1, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 
1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 10: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 1) ] @@ + p2 :> + [ term |-> 2, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + nextSendLsn |-> (a1 :> 1) ] ) +/\ acc_state = ( a1 :> + [ term |-> 2, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 1, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 11: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 1) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<>>, + state |-> "campaign", + votes |-> <<>>, + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [ term |-> 2, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 1, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 2, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 12: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 1) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [ term |-> 2, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 1, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 13: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<1>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 1) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) 
+/\ acc_state = ( a1 :> + [ term |-> 2, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 1, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 14: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 1) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [ term |-> 2, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 1, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 15: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 2) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [ term |-> 2, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 1, + wal |-> <<1>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 16: +/\ prop_state = ( p1 :> + [ term |-> 1, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a2 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + nextSendLsn |-> (a2 :> 3) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [ term |-> 2, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 1, + wal |-> <<1, 1>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 17: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "campaign", + votes |-> <<>>, + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state 
= ( a1 :> + [ term |-> 2, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 1, + wal |-> <<1, 1>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 18: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 1, + wal |-> <<1, 1>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 19: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 20: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> <<>> ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 21: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, 
lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [term |-> 3, wal |-> <<>>, termHistory |-> <<[term |-> 0, lsn |-> 1]>>] ) +/\ committed = {} + +State 22: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 1) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 3, + wal |-> <<>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] ) +/\ committed = {} + +State 23: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 3, + wal |-> <<1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] ) +/\ committed = {} + +State 24: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + 
nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 3, + wal |-> <<1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] ) +/\ committed = {[term |-> 1, lsn |-> 1]} + +State 25: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "campaign", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 2 ] ), + termHistory |-> <<>>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 4, + wal |-> <<1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] ) +/\ committed = {[term |-> 1, lsn |-> 1]} + +State 26: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 2 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >>, + nextSendLsn |-> <<>> ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 4, + wal |-> <<1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] ) +/\ committed = {[term |-> 1, lsn |-> 1]} + +State 27: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 2 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >>, + 
nextSendLsn |-> (a3 :> 1) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >> ] ) +/\ committed = {[term |-> 1, lsn |-> 1]} + +State 28: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<4>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 2 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >>, + nextSendLsn |-> (a3 :> 1) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >> ] ) +/\ committed = {[term |-> 1, lsn |-> 1]} + +State 29: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<4>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 2 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >>, + nextSendLsn |-> (a3 :> 2) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 4, + wal |-> <<4>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 
4, lsn |-> 1] >> ] ) +/\ committed = {[term |-> 1, lsn |-> 1]} + +State 30: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<4>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 2 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >>, + nextSendLsn |-> (a1 :> 1 @@ a3 :> 2) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 4, + wal |-> <<4>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >> ] ) +/\ committed = {[term |-> 1, lsn |-> 1]} + +State 31: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<4>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 2 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >>, + nextSendLsn |-> (a1 :> 2 @@ a3 :> 2) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, + flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<4>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 4, + wal |-> <<4>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >> ] ) +/\ committed = {[term |-> 1, lsn |-> 1]} + +State 32: +/\ prop_state = ( p1 :> + [ term |-> 4, + wal |-> <<4>>, + state |-> "leader", + votes |-> + ( a1 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 2, lsn |-> 1]>>, + flushLsn |-> 1 ] @@ + a3 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 2 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >>, + nextSendLsn |-> (a1 :> 2 @@ a3 :> 2) ] @@ + p2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + state |-> "leader", + votes |-> + ( a2 :> + [ termHistory |-> + <<[term |-> 0, lsn |-> 1], [term |-> 1, lsn |-> 1]>>, + flushLsn |-> 3 ] @@ + a3 :> + [ termHistory |-> <<[term |-> 0, lsn |-> 1]>>, 
+ flushLsn |-> 1 ] ), + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >>, + nextSendLsn |-> (a2 :> 3 @@ a3 :> 2) ] ) +/\ acc_state = ( a1 :> + [ term |-> 4, + wal |-> <<4>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >> ] @@ + a2 :> + [ term |-> 3, + wal |-> <<1, 1>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 1, lsn |-> 1], + [term |-> 3, lsn |-> 3] >> ] @@ + a3 :> + [ term |-> 4, + wal |-> <<4>>, + termHistory |-> + << [term |-> 0, lsn |-> 1], + [term |-> 2, lsn |-> 1], + [term |-> 4, lsn |-> 1] >> ] ) +/\ committed = {[term |-> 1, lsn |-> 1], [term |-> 4, lsn |-> 1]} + +1712918117 states generated, 174460942 distinct states found, 50658619 states left on queue. +The depth of the complete state graph search is 35. +Finished in 58min 19s at (2024-11-06 15:18:45) +Trace exploration spec path: ./MCProposerAcceptorStatic_TTrace_1730902825.tla diff --git a/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--15-30-45.log b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--15-30-45.log new file mode 100644 index 000000000000..8248240dedfb --- /dev/null +++ b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a3_t4_l4.cfg-2024-11-06--15-30-45.log @@ -0,0 +1,1374 @@ +git revision: 4f1ee6331 +Platform: Linux neon-dev-arm64-1 6.8.0-48-generic #48-Ubuntu SMP PREEMPT_DYNAMIC Fri Sep 27 14:35:45 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux +CPU Info Linux: Neoverse-N1 +CPU Cores Linux: 80 +CPU Info Mac: +CPU Cores Mac: +Spec: MCProposerAcceptorStatic.tla +Config: models/MCProposerAcceptorStatic_p2_a3_t4_l4.cfg +---- +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3} +max_term = 4 +max_entries = 4 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + +---- + +TLC2 Version 2.20 of Day Month 20?? (rev: f68cb71) +Running breadth-first search Model-Checking with fp 84 and seed -1069171980999686913 with 80 workers on 80 cores with 54613MB heap and 61440MB offheap memory [pid: 62544] (Linux 6.8.0-48-generic aarch64, Ubuntu 21.0.4 x86_64, OffHeapDiskFPSet, DiskStateQueue). 
+Parsing file /home/arseny/neon/safekeeper/spec/MCProposerAcceptorStatic.tla +Parsing file /tmp/tlc-6542850091824737097/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla) +Parsing file /home/arseny/neon/safekeeper/spec/ProposerAcceptorStatic.tla +Parsing file /tmp/tlc-6542850091824737097/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla) +Parsing file /tmp/tlc-6542850091824737097/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla) +Parsing file /tmp/tlc-6542850091824737097/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla) +Parsing file /tmp/tlc-6542850091824737097/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla) +Parsing file /tmp/tlc-6542850091824737097/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla) +Parsing file /tmp/tlc-6542850091824737097/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla) +Semantic processing of module Naturals +Semantic processing of module Sequences +Semantic processing of module FiniteSets +Semantic processing of module TLC +Semantic processing of module Integers +Semantic processing of module ProposerAcceptorStatic +Semantic processing of module TLCExt +Semantic processing of module _TLCTrace +Semantic processing of module MCProposerAcceptorStatic +Starting... (2024-11-06 15:30:45) +Computing initial states... +Finished computing initial states: 1 distinct state generated at 2024-11-06 15:30:48. +Progress(20) at 2024-11-06 15:30:51: 956,386 states generated (956,386 s/min), 134,121 distinct states found (134,121 ds/min), 57,996 states left on queue. +Progress(27) at 2024-11-06 15:31:51: 30,048,294 states generated (29,091,908 s/min), 3,778,849 distinct states found (3,644,728 ds/min), 1,463,715 states left on queue. +Progress(28) at 2024-11-06 15:32:51: 59,092,248 states generated (29,043,954 s/min), 7,282,332 distinct states found (3,503,483 ds/min), 2,750,944 states left on queue. +Progress(29) at 2024-11-06 15:33:51: 88,333,136 states generated (29,240,888 s/min), 10,694,325 distinct states found (3,411,993 ds/min), 3,955,744 states left on queue. +Progress(29) at 2024-11-06 15:34:51: 117,708,994 states generated (29,375,858 s/min), 14,000,885 distinct states found (3,306,560 ds/min), 5,067,487 states left on queue. +Progress(30) at 2024-11-06 15:35:51: 146,847,667 states generated (29,138,673 s/min), 17,407,824 distinct states found (3,406,939 ds/min), 6,258,337 states left on queue. +Progress(30) at 2024-11-06 15:36:51: 176,211,801 states generated (29,364,134 s/min), 20,626,933 distinct states found (3,219,109 ds/min), 7,302,661 states left on queue. +Progress(31) at 2024-11-06 15:37:51: 205,665,438 states generated (29,453,637 s/min), 23,877,622 distinct states found (3,250,689 ds/min), 8,361,004 states left on queue. +Progress(31) at 2024-11-06 15:38:51: 234,757,357 states generated (29,091,919 s/min), 27,246,813 distinct states found (3,369,191 ds/min), 9,511,916 states left on queue. +Progress(31) at 2024-11-06 15:39:51: 264,154,436 states generated (29,397,079 s/min), 30,383,069 distinct states found (3,136,256 ds/min), 10,494,238 states left on queue. +Progress(31) at 2024-11-06 15:40:51: 293,638,121 states generated (29,483,685 s/min), 33,498,433 distinct states found (3,115,364 ds/min), 11,429,812 states left on queue. 
+Progress(32) at 2024-11-06 15:41:51: 323,039,991 states generated (29,401,870 s/min), 36,709,338 distinct states found (3,210,905 ds/min), 12,463,752 states left on queue. +Progress(32) at 2024-11-06 15:42:51: 352,081,458 states generated (29,041,467 s/min), 39,979,938 distinct states found (3,270,600 ds/min), 13,531,461 states left on queue. +Progress(32) at 2024-11-06 15:43:51: 381,472,323 states generated (29,390,865 s/min), 43,147,359 distinct states found (3,167,421 ds/min), 14,513,444 states left on queue. +Progress(32) at 2024-11-06 15:44:51: 410,911,764 states generated (29,439,441 s/min), 46,200,793 distinct states found (3,053,434 ds/min), 15,418,951 states left on queue. +Progress(32) at 2024-11-06 15:45:51: 440,514,627 states generated (29,602,863 s/min), 49,210,279 distinct states found (3,009,486 ds/min), 16,263,879 states left on queue. +Progress(33) at 2024-11-06 15:46:51: 470,070,180 states generated (29,555,553 s/min), 52,317,535 distinct states found (3,107,256 ds/min), 17,200,875 states left on queue. +Progress(33) at 2024-11-06 15:47:51: 499,387,268 states generated (29,317,088 s/min), 55,489,376 distinct states found (3,171,841 ds/min), 18,196,719 states left on queue. +Progress(33) at 2024-11-06 15:48:51: 528,308,354 states generated (28,921,086 s/min), 58,716,400 distinct states found (3,227,024 ds/min), 19,225,822 states left on queue. +Progress(33) at 2024-11-06 15:49:51: 557,626,508 states generated (29,318,154 s/min), 61,861,039 distinct states found (3,144,639 ds/min), 20,172,391 states left on queue. +Progress(33) at 2024-11-06 15:50:51: 587,011,551 states generated (29,385,043 s/min), 64,911,520 distinct states found (3,050,481 ds/min), 21,068,246 states left on queue. +Progress(33) at 2024-11-06 15:51:51: 616,469,665 states generated (29,458,114 s/min), 67,862,377 distinct states found (2,950,857 ds/min), 21,888,495 states left on queue. +Progress(33) at 2024-11-06 15:52:51: 646,037,901 states generated (29,568,236 s/min), 70,774,601 distinct states found (2,912,224 ds/min), 22,642,487 states left on queue. +Progress(33) at 2024-11-06 15:53:51: 675,679,292 states generated (29,641,391 s/min), 73,753,124 distinct states found (2,978,523 ds/min), 23,459,982 states left on queue. +Progress(34) at 2024-11-06 15:54:51: 705,213,119 states generated (29,533,827 s/min), 76,751,356 distinct states found (2,998,232 ds/min), 24,319,315 states left on queue. +Progress(34) at 2024-11-06 15:55:51: 734,548,637 states generated (29,335,518 s/min), 79,865,504 distinct states found (3,114,148 ds/min), 25,270,867 states left on queue. +Progress(34) at 2024-11-06 15:56:51: 763,724,351 states generated (29,175,714 s/min), 82,969,406 distinct states found (3,103,902 ds/min), 26,203,099 states left on queue. +Progress(34) at 2024-11-06 15:57:51: 792,795,916 states generated (29,071,565 s/min), 86,092,913 distinct states found (3,123,507 ds/min), 27,124,641 states left on queue. +Progress(34) at 2024-11-06 15:58:51: 822,084,221 states generated (29,288,305 s/min), 89,196,548 distinct states found (3,103,635 ds/min), 28,028,058 states left on queue. +Progress(34) at 2024-11-06 15:59:51: 851,516,510 states generated (29,432,289 s/min), 92,135,078 distinct states found (2,938,530 ds/min), 28,822,750 states left on queue. 
+Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 16:00:51) +Progress(34) at 2024-11-06 16:00:51: 880,891,436 states generated (29,374,926 s/min), 95,133,622 distinct states found (2,998,544 ds/min), 29,669,470 states left on queue. +Progress(34) at 2024-11-06 16:01:51: 910,262,536 states generated (29,371,100 s/min), 98,019,631 distinct states found (2,886,009 ds/min), 30,433,293 states left on queue. +Progress(34) at 2024-11-06 16:02:51: 939,689,255 states generated (29,426,719 s/min), 100,814,884 distinct states found (2,795,253 ds/min), 31,083,132 states left on queue. +Progress(34) at 2024-11-06 16:03:51: 969,299,651 states generated (29,610,396 s/min), 103,664,772 distinct states found (2,849,888 ds/min), 31,821,093 states left on queue. +Progress(34) at 2024-11-06 16:04:51: 999,051,292 states generated (29,751,641 s/min), 106,544,287 distinct states found (2,879,515 ds/min), 32,536,946 states left on queue. +Progress(35) at 2024-11-06 16:05:51: 1,028,690,576 states generated (29,639,284 s/min), 109,444,362 distinct states found (2,900,075 ds/min), 33,326,316 states left on queue. +Progress(35) at 2024-11-06 16:06:51: 1,058,155,400 states generated (29,464,824 s/min), 112,439,937 distinct states found (2,995,575 ds/min), 34,167,604 states left on queue. +Progress(35) at 2024-11-06 16:07:51: 1,087,496,744 states generated (29,341,344 s/min), 115,461,649 distinct states found (3,021,712 ds/min), 35,032,974 states left on queue. +Progress(35) at 2024-11-06 16:08:51: 1,116,663,767 states generated (29,167,023 s/min), 118,482,838 distinct states found (3,021,189 ds/min), 35,902,651 states left on queue. +Progress(35) at 2024-11-06 16:09:51: 1,145,439,918 states generated (28,776,151 s/min), 121,562,159 distinct states found (3,079,321 ds/min), 36,785,088 states left on queue. +Progress(35) at 2024-11-06 16:10:51: 1,174,812,354 states generated (29,372,436 s/min), 124,511,721 distinct states found (2,949,562 ds/min), 37,555,204 states left on queue. +Progress(35) at 2024-11-06 16:11:51: 1,204,150,178 states generated (29,337,824 s/min), 127,579,155 distinct states found (3,067,434 ds/min), 38,425,790 states left on queue. +Progress(35) at 2024-11-06 16:12:51: 1,233,620,353 states generated (29,470,175 s/min), 130,490,427 distinct states found (2,911,272 ds/min), 39,188,412 states left on queue. +Progress(35) at 2024-11-06 16:13:51: 1,263,022,331 states generated (29,401,978 s/min), 133,317,160 distinct states found (2,826,733 ds/min), 39,893,070 states left on queue. +Progress(35) at 2024-11-06 16:14:51: 1,292,411,979 states generated (29,389,648 s/min), 136,229,817 distinct states found (2,912,657 ds/min), 40,666,029 states left on queue. +Progress(35) at 2024-11-06 16:15:51: 1,321,695,856 states generated (29,283,877 s/min), 139,081,910 distinct states found (2,852,093 ds/min), 41,389,715 states left on queue. +Progress(35) at 2024-11-06 16:16:51: 1,351,045,560 states generated (29,349,704 s/min), 141,811,662 distinct states found (2,729,752 ds/min), 41,999,267 states left on queue. +Progress(35) at 2024-11-06 16:17:51: 1,380,677,436 states generated (29,631,876 s/min), 144,516,072 distinct states found (2,704,410 ds/min), 42,579,779 states left on queue. +Progress(35) at 2024-11-06 16:18:51: 1,410,332,660 states generated (29,655,224 s/min), 147,269,848 distinct states found (2,753,776 ds/min), 43,232,732 states left on queue. 
+Progress(35) at 2024-11-06 16:19:51: 1,440,071,594 states generated (29,738,934 s/min), 150,116,683 distinct states found (2,846,835 ds/min), 43,917,859 states left on queue. +Progress(35) at 2024-11-06 16:20:51: 1,469,737,942 states generated (29,666,348 s/min), 152,881,605 distinct states found (2,764,922 ds/min), 44,594,909 states left on queue. +Progress(36) at 2024-11-06 16:21:51: 1,499,124,482 states generated (29,386,540 s/min), 155,722,313 distinct states found (2,840,708 ds/min), 45,306,186 states left on queue. +Progress(36) at 2024-11-06 16:22:51: 1,528,616,635 states generated (29,492,153 s/min), 158,643,911 distinct states found (2,921,598 ds/min), 46,098,600 states left on queue. +Progress(36) at 2024-11-06 16:23:51: 1,557,820,328 states generated (29,203,693 s/min), 161,651,516 distinct states found (3,007,605 ds/min), 46,958,572 states left on queue. +Progress(36) at 2024-11-06 16:24:51: 1,587,341,565 states generated (29,521,237 s/min), 164,469,424 distinct states found (2,817,908 ds/min), 47,648,932 states left on queue. +Progress(36) at 2024-11-06 16:25:51: 1,616,246,807 states generated (28,905,242 s/min), 167,471,199 distinct states found (3,001,775 ds/min), 48,496,844 states left on queue. +Progress(36) at 2024-11-06 16:26:51: 1,645,107,613 states generated (28,860,806 s/min), 170,454,103 distinct states found (2,982,904 ds/min), 49,283,244 states left on queue. +Progress(36) at 2024-11-06 16:27:51: 1,674,492,314 states generated (29,384,701 s/min), 173,343,045 distinct states found (2,888,942 ds/min), 50,006,895 states left on queue. +Progress(36) at 2024-11-06 16:28:51: 1,703,875,027 states generated (29,382,713 s/min), 176,157,623 distinct states found (2,814,578 ds/min), 50,662,128 states left on queue. +Progress(36) at 2024-11-06 16:29:51: 1,733,099,131 states generated (29,224,104 s/min), 179,186,519 distinct states found (3,028,896 ds/min), 51,498,029 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 16:30:52) +Progress(36) at 2024-11-06 16:30:52: 1,762,724,622 states generated (29,625,491 s/min), 181,958,595 distinct states found (2,772,076 ds/min), 52,142,450 states left on queue. +Progress(36) at 2024-11-06 16:31:52: 1,792,118,288 states generated (29,393,666 s/min), 184,725,090 distinct states found (2,766,495 ds/min), 52,785,705 states left on queue. +Progress(36) at 2024-11-06 16:32:52: 1,821,258,069 states generated (29,139,781 s/min), 187,681,452 distinct states found (2,956,362 ds/min), 53,592,610 states left on queue. +Progress(36) at 2024-11-06 16:33:52: 1,850,729,054 states generated (29,470,985 s/min), 190,451,722 distinct states found (2,770,270 ds/min), 54,239,919 states left on queue. +Progress(36) at 2024-11-06 16:34:52: 1,879,860,913 states generated (29,131,859 s/min), 193,207,770 distinct states found (2,756,048 ds/min), 54,886,748 states left on queue. +Progress(36) at 2024-11-06 16:35:52: 1,909,200,565 states generated (29,339,652 s/min), 195,832,123 distinct states found (2,624,353 ds/min), 55,404,535 states left on queue. +Progress(36) at 2024-11-06 16:36:52: 1,938,403,873 states generated (29,203,308 s/min), 198,569,916 distinct states found (2,737,793 ds/min), 55,993,675 states left on queue. +Progress(36) at 2024-11-06 16:37:52: 1,968,097,695 states generated (29,693,822 s/min), 201,148,799 distinct states found (2,578,883 ds/min), 56,501,179 states left on queue. 
+Progress(36) at 2024-11-06 16:38:52: 1,997,628,304 states generated (29,530,609 s/min), 203,860,765 distinct states found (2,711,966 ds/min), 57,133,283 states left on queue. +Progress(36) at 2024-11-06 16:39:52: 2,027,338,755 states generated (29,710,451 s/min), 206,496,491 distinct states found (2,635,726 ds/min), 57,649,914 states left on queue. +Progress(36) at 2024-11-06 16:40:52: 2,057,072,538 states generated (29,733,783 s/min), 209,189,488 distinct states found (2,692,997 ds/min), 58,229,449 states left on queue. +Progress(36) at 2024-11-06 16:41:52: 2,086,549,250 states generated (29,476,712 s/min), 211,909,869 distinct states found (2,720,381 ds/min), 58,875,611 states left on queue. +Progress(37) at 2024-11-06 16:42:52: 2,115,953,926 states generated (29,404,676 s/min), 214,630,876 distinct states found (2,721,007 ds/min), 59,494,220 states left on queue. +Progress(37) at 2024-11-06 16:43:52: 2,145,423,196 states generated (29,469,270 s/min), 217,412,888 distinct states found (2,782,012 ds/min), 60,176,423 states left on queue. +Progress(37) at 2024-11-06 16:44:52: 2,174,796,796 states generated (29,373,600 s/min), 220,316,140 distinct states found (2,903,252 ds/min), 60,925,815 states left on queue. +Progress(37) at 2024-11-06 16:45:52: 2,203,907,384 states generated (29,110,588 s/min), 223,255,125 distinct states found (2,938,985 ds/min), 61,739,564 states left on queue. +Progress(37) at 2024-11-06 16:46:52: 2,233,378,272 states generated (29,470,888 s/min), 225,995,858 distinct states found (2,740,733 ds/min), 62,364,627 states left on queue. +Progress(37) at 2024-11-06 16:47:52: 2,262,648,334 states generated (29,270,062 s/min), 228,738,653 distinct states found (2,742,795 ds/min), 63,003,155 states left on queue. +Progress(37) at 2024-11-06 16:48:52: 2,291,309,648 states generated (28,661,314 s/min), 231,720,498 distinct states found (2,981,845 ds/min), 63,816,162 states left on queue. +Progress(37) at 2024-11-06 16:49:52: 2,320,153,384 states generated (28,843,736 s/min), 234,599,475 distinct states found (2,878,977 ds/min), 64,513,886 states left on queue. +Progress(37) at 2024-11-06 16:50:52: 2,349,538,907 states generated (29,385,523 s/min), 237,330,640 distinct states found (2,731,165 ds/min), 65,105,576 states left on queue. +Progress(37) at 2024-11-06 16:51:52: 2,379,015,082 states generated (29,476,175 s/min), 240,064,625 distinct states found (2,733,985 ds/min), 65,704,108 states left on queue. +Progress(37) at 2024-11-06 16:52:52: 2,408,376,582 states generated (29,361,500 s/min), 242,869,889 distinct states found (2,805,264 ds/min), 66,339,299 states left on queue. +Progress(37) at 2024-11-06 16:53:52: 2,437,554,516 states generated (29,177,934 s/min), 245,844,106 distinct states found (2,974,217 ds/min), 67,125,834 states left on queue. +Progress(37) at 2024-11-06 16:54:52: 2,466,925,193 states generated (29,370,677 s/min), 248,540,587 distinct states found (2,696,481 ds/min), 67,707,623 states left on queue. +Progress(37) at 2024-11-06 16:55:52: 2,496,386,977 states generated (29,461,784 s/min), 251,318,893 distinct states found (2,778,306 ds/min), 68,345,796 states left on queue. +Progress(37) at 2024-11-06 16:56:52: 2,525,837,965 states generated (29,450,988 s/min), 253,918,986 distinct states found (2,600,093 ds/min), 68,851,521 states left on queue. +Progress(37) at 2024-11-06 16:57:52: 2,555,073,687 states generated (29,235,722 s/min), 256,806,753 distinct states found (2,887,767 ds/min), 69,596,597 states left on queue. 
+Progress(37) at 2024-11-06 16:58:52: 2,584,381,294 states generated (29,307,607 s/min), 259,714,054 distinct states found (2,907,301 ds/min), 70,335,539 states left on queue. +Progress(37) at 2024-11-06 16:59:52: 2,613,557,081 states generated (29,175,787 s/min), 262,407,462 distinct states found (2,693,408 ds/min), 70,920,265 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 17:00:53) +Progress(37) at 2024-11-06 17:00:53: 2,643,168,141 states generated (29,611,060 s/min), 264,973,171 distinct states found (2,565,709 ds/min), 71,384,749 states left on queue. +Progress(37) at 2024-11-06 17:01:53: 2,672,453,868 states generated (29,285,727 s/min), 267,551,971 distinct states found (2,578,800 ds/min), 71,854,220 states left on queue. +Progress(37) at 2024-11-06 17:02:53: 2,701,696,399 states generated (29,242,531 s/min), 270,233,135 distinct states found (2,681,164 ds/min), 72,406,567 states left on queue. +Progress(37) at 2024-11-06 17:03:53: 2,731,216,488 states generated (29,520,089 s/min), 272,711,390 distinct states found (2,478,255 ds/min), 72,805,269 states left on queue. +Progress(37) at 2024-11-06 17:04:53: 2,760,788,758 states generated (29,572,270 s/min), 275,307,217 distinct states found (2,595,827 ds/min), 73,313,123 states left on queue. +Progress(37) at 2024-11-06 17:05:53: 2,790,339,552 states generated (29,550,794 s/min), 277,881,113 distinct states found (2,573,896 ds/min), 73,833,900 states left on queue. +Progress(37) at 2024-11-06 17:06:53: 2,820,046,206 states generated (29,706,654 s/min), 280,371,086 distinct states found (2,489,973 ds/min), 74,231,258 states left on queue. +Progress(37) at 2024-11-06 17:07:53: 2,849,787,753 states generated (29,741,547 s/min), 283,097,131 distinct states found (2,726,045 ds/min), 74,814,735 states left on queue. +Progress(37) at 2024-11-06 17:08:53: 2,879,520,949 states generated (29,733,196 s/min), 285,608,053 distinct states found (2,510,922 ds/min), 75,293,894 states left on queue. +Progress(37) at 2024-11-06 17:09:53: 2,908,889,760 states generated (29,368,811 s/min), 288,274,872 distinct states found (2,666,819 ds/min), 75,880,480 states left on queue. +Progress(38) at 2024-11-06 17:10:53: 2,938,412,523 states generated (29,522,763 s/min), 290,877,598 distinct states found (2,602,726 ds/min), 76,391,156 states left on queue. +Progress(38) at 2024-11-06 17:11:53: 2,967,963,455 states generated (29,550,932 s/min), 293,492,146 distinct states found (2,614,548 ds/min), 76,932,124 states left on queue. +Progress(38) at 2024-11-06 17:12:53: 2,997,327,370 states generated (29,363,915 s/min), 296,353,306 distinct states found (2,861,160 ds/min), 77,659,606 states left on queue. +Progress(38) at 2024-11-06 17:13:53: 3,026,713,138 states generated (29,385,768 s/min), 299,173,963 distinct states found (2,820,657 ds/min), 78,342,645 states left on queue. +Progress(38) at 2024-11-06 17:14:53: 3,055,986,492 states generated (29,273,354 s/min), 302,024,049 distinct states found (2,850,086 ds/min), 79,071,501 states left on queue. +Progress(38) at 2024-11-06 17:15:53: 3,085,491,974 states generated (29,505,482 s/min), 304,668,970 distinct states found (2,644,921 ds/min), 79,608,084 states left on queue. +Progress(38) at 2024-11-06 17:16:53: 3,114,898,266 states generated (29,406,292 s/min), 307,272,526 distinct states found (2,603,556 ds/min), 80,132,575 states left on queue. 
+Progress(38) at 2024-11-06 17:17:53: 3,144,023,490 states generated (29,125,224 s/min), 310,022,073 distinct states found (2,749,547 ds/min), 80,777,238 states left on queue. +Progress(38) at 2024-11-06 17:18:53: 3,172,762,795 states generated (28,739,305 s/min), 312,891,905 distinct states found (2,869,832 ds/min), 81,497,739 states left on queue. +Progress(38) at 2024-11-06 17:19:53: 3,201,314,425 states generated (28,551,630 s/min), 315,766,566 distinct states found (2,874,661 ds/min), 82,171,729 states left on queue. +Progress(38) at 2024-11-06 17:20:53: 3,230,713,777 states generated (29,399,352 s/min), 318,365,612 distinct states found (2,599,046 ds/min), 82,638,018 states left on queue. +Progress(38) at 2024-11-06 17:21:53: 3,260,188,634 states generated (29,474,857 s/min), 321,040,810 distinct states found (2,675,198 ds/min), 83,185,708 states left on queue. +Progress(38) at 2024-11-06 17:22:53: 3,289,654,456 states generated (29,465,822 s/min), 323,660,313 distinct states found (2,619,503 ds/min), 83,689,075 states left on queue. +Progress(38) at 2024-11-06 17:23:53: 3,319,003,677 states generated (29,349,221 s/min), 326,391,347 distinct states found (2,731,034 ds/min), 84,261,368 states left on queue. +Progress(38) at 2024-11-06 17:24:53: 3,348,330,685 states generated (29,327,008 s/min), 329,204,934 distinct states found (2,813,587 ds/min), 84,925,046 states left on queue. +Progress(38) at 2024-11-06 17:25:53: 3,377,572,946 states generated (29,242,261 s/min), 331,997,887 distinct states found (2,792,953 ds/min), 85,533,473 states left on queue. +Progress(38) at 2024-11-06 17:26:53: 3,406,881,714 states generated (29,308,768 s/min), 334,599,745 distinct states found (2,601,858 ds/min), 86,047,276 states left on queue. +Progress(38) at 2024-11-06 17:27:53: 3,436,375,389 states generated (29,493,675 s/min), 337,261,572 distinct states found (2,661,827 ds/min), 86,591,357 states left on queue. +Progress(38) at 2024-11-06 17:28:53: 3,465,811,732 states generated (29,436,343 s/min), 339,829,613 distinct states found (2,568,041 ds/min), 87,057,550 states left on queue. +Progress(38) at 2024-11-06 17:29:53: 3,495,144,983 states generated (29,333,251 s/min), 342,566,275 distinct states found (2,736,662 ds/min), 87,671,131 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 17:30:53) +Progress(38) at 2024-11-06 17:30:53: 3,524,611,246 states generated (29,466,263 s/min), 345,366,358 distinct states found (2,800,083 ds/min), 88,316,673 states left on queue. +Progress(38) at 2024-11-06 17:31:53: 3,553,819,331 states generated (29,208,085 s/min), 348,291,666 distinct states found (2,925,308 ds/min), 89,059,679 states left on queue. +Progress(38) at 2024-11-06 17:32:53: 3,583,208,821 states generated (29,389,490 s/min), 350,796,636 distinct states found (2,504,970 ds/min), 89,478,521 states left on queue. +Progress(38) at 2024-11-06 17:33:53: 3,612,329,910 states generated (29,121,089 s/min), 353,414,448 distinct states found (2,617,812 ds/min), 90,008,568 states left on queue. +Progress(38) at 2024-11-06 17:34:53: 3,641,485,253 states generated (29,155,343 s/min), 356,010,441 distinct states found (2,595,993 ds/min), 90,486,313 states left on queue. +Progress(38) at 2024-11-06 17:35:53: 3,670,761,645 states generated (29,276,392 s/min), 358,411,973 distinct states found (2,401,532 ds/min), 90,799,029 states left on queue. 
+Progress(38) at 2024-11-06 17:36:53: 3,700,008,207 states generated (29,246,562 s/min), 360,943,422 distinct states found (2,531,449 ds/min), 91,235,694 states left on queue. +Progress(38) at 2024-11-06 17:37:53: 3,729,045,761 states generated (29,037,554 s/min), 363,523,499 distinct states found (2,580,077 ds/min), 91,685,579 states left on queue. +Progress(38) at 2024-11-06 17:38:53: 3,758,697,262 states generated (29,651,501 s/min), 365,860,396 distinct states found (2,336,897 ds/min), 92,003,313 states left on queue. +Progress(38) at 2024-11-06 17:39:53: 3,788,188,489 states generated (29,491,227 s/min), 368,369,398 distinct states found (2,509,002 ds/min), 92,452,083 states left on queue. +Progress(38) at 2024-11-06 17:40:53: 3,817,718,772 states generated (29,530,283 s/min), 370,855,965 distinct states found (2,486,567 ds/min), 92,899,812 states left on queue. +Progress(38) at 2024-11-06 17:41:53: 3,847,372,748 states generated (29,653,976 s/min), 373,231,774 distinct states found (2,375,809 ds/min), 93,202,503 states left on queue. +Progress(38) at 2024-11-06 17:42:53: 3,877,091,950 states generated (29,719,202 s/min), 375,934,374 distinct states found (2,702,600 ds/min), 93,775,105 states left on queue. +Progress(38) at 2024-11-06 17:43:53: 3,906,843,295 states generated (29,751,345 s/min), 378,304,497 distinct states found (2,370,123 ds/min), 94,098,611 states left on queue. +Progress(38) at 2024-11-06 17:44:53: 3,936,304,033 states generated (29,460,738 s/min), 380,793,774 distinct states found (2,489,277 ds/min), 94,560,398 states left on queue. +Progress(38) at 2024-11-06 17:45:53: 3,965,687,311 states generated (29,383,278 s/min), 383,366,376 distinct states found (2,572,602 ds/min), 95,062,163 states left on queue. +Progress(38) at 2024-11-06 17:46:53: 3,995,264,758 states generated (29,577,447 s/min), 385,832,314 distinct states found (2,465,938 ds/min), 95,460,777 states left on queue. +Progress(38) at 2024-11-06 17:47:53: 4,024,519,333 states generated (29,254,575 s/min), 388,384,282 distinct states found (2,551,968 ds/min), 95,931,698 states left on queue. +Progress(38) at 2024-11-06 17:48:53: 4,054,053,752 states generated (29,534,419 s/min), 390,990,581 distinct states found (2,606,299 ds/min), 96,493,705 states left on queue. +Progress(38) at 2024-11-06 17:49:53: 4,083,403,606 states generated (29,349,854 s/min), 393,717,328 distinct states found (2,726,747 ds/min), 97,099,592 states left on queue. +Progress(38) at 2024-11-06 17:50:53: 4,112,753,694 states generated (29,350,088 s/min), 396,441,909 distinct states found (2,724,581 ds/min), 97,694,523 states left on queue. +Progress(38) at 2024-11-06 17:51:53: 4,141,940,951 states generated (29,187,257 s/min), 399,238,612 distinct states found (2,796,703 ds/min), 98,387,103 states left on queue. +Progress(38) at 2024-11-06 17:52:53: 4,171,185,273 states generated (29,244,322 s/min), 401,861,376 distinct states found (2,622,764 ds/min), 98,900,168 states left on queue. +Progress(38) at 2024-11-06 17:53:53: 4,200,735,055 states generated (29,549,782 s/min), 404,419,627 distinct states found (2,558,251 ds/min), 99,388,507 states left on queue. +Progress(38) at 2024-11-06 17:54:53: 4,230,057,902 states generated (29,322,847 s/min), 406,926,477 distinct states found (2,506,850 ds/min), 99,826,562 states left on queue. +Progress(38) at 2024-11-06 17:55:53: 4,259,279,515 states generated (29,221,613 s/min), 409,512,606 distinct states found (2,586,129 ds/min), 100,340,214 states left on queue. 
+Progress(38) at 2024-11-06 17:56:53: 4,288,265,663 states generated (28,986,148 s/min), 412,254,402 distinct states found (2,741,796 ds/min), 100,966,036 states left on queue. +Progress(38) at 2024-11-06 17:57:53: 4,316,798,413 states generated (28,532,750 s/min), 415,047,481 distinct states found (2,793,079 ds/min), 101,589,869 states left on queue. +Progress(38) at 2024-11-06 17:58:53: 4,345,527,290 states generated (28,728,877 s/min), 417,768,588 distinct states found (2,721,107 ds/min), 102,133,503 states left on queue. +Progress(38) at 2024-11-06 17:59:53: 4,374,924,942 states generated (29,397,652 s/min), 420,254,082 distinct states found (2,485,494 ds/min), 102,500,461 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 18:00:54) +Progress(38) at 2024-11-06 18:00:54: 4,404,604,911 states generated (29,679,969 s/min), 422,801,691 distinct states found (2,547,609 ds/min), 102,936,440 states left on queue. +Progress(38) at 2024-11-06 18:01:54: 4,434,018,901 states generated (29,413,990 s/min), 425,477,119 distinct states found (2,675,428 ds/min), 103,472,987 states left on queue. +Progress(38) at 2024-11-06 18:02:54: 4,463,498,297 states generated (29,479,396 s/min), 427,949,289 distinct states found (2,472,170 ds/min), 103,858,839 states left on queue. +Progress(38) at 2024-11-06 18:03:54: 4,492,775,931 states generated (29,277,634 s/min), 430,592,094 distinct states found (2,642,805 ds/min), 104,353,609 states left on queue. +Progress(38) at 2024-11-06 18:04:54: 4,522,002,300 states generated (29,226,369 s/min), 433,322,584 distinct states found (2,730,490 ds/min), 104,949,753 states left on queue. +Progress(38) at 2024-11-06 18:05:54: 4,551,375,180 states generated (29,372,880 s/min), 436,005,138 distinct states found (2,682,554 ds/min), 105,482,546 states left on queue. +Progress(38) at 2024-11-06 18:06:54: 4,580,718,169 states generated (29,342,989 s/min), 438,516,579 distinct states found (2,511,441 ds/min), 105,868,435 states left on queue. +Progress(38) at 2024-11-06 18:07:54: 4,609,859,344 states generated (29,141,175 s/min), 441,134,700 distinct states found (2,618,121 ds/min), 106,390,335 states left on queue. +Progress(38) at 2024-11-06 18:08:54: 4,639,331,150 states generated (29,471,806 s/min), 443,662,679 distinct states found (2,527,979 ds/min), 106,821,264 states left on queue. +Progress(38) at 2024-11-06 18:09:54: 4,668,696,820 states generated (29,365,670 s/min), 446,222,969 distinct states found (2,560,290 ds/min), 107,277,508 states left on queue. +Progress(38) at 2024-11-06 18:10:54: 4,698,140,829 states generated (29,444,009 s/min), 448,693,022 distinct states found (2,470,053 ds/min), 107,654,262 states left on queue. +Progress(38) at 2024-11-06 18:11:54: 4,727,380,985 states generated (29,240,156 s/min), 451,459,276 distinct states found (2,766,254 ds/min), 108,284,101 states left on queue. +Progress(38) at 2024-11-06 18:12:54: 4,756,654,088 states generated (29,273,103 s/min), 454,180,180 distinct states found (2,720,904 ds/min), 108,879,205 states left on queue. +Progress(38) at 2024-11-06 18:13:54: 4,785,893,104 states generated (29,239,016 s/min), 457,001,077 distinct states found (2,820,897 ds/min), 109,511,015 states left on queue. +Progress(38) at 2024-11-06 18:14:54: 4,815,289,339 states generated (29,396,235 s/min), 459,530,340 distinct states found (2,529,263 ds/min), 109,951,588 states left on queue. 
+Progress(38) at 2024-11-06 18:15:54: 4,844,354,767 states generated (29,065,428 s/min), 462,144,567 distinct states found (2,614,227 ds/min), 110,455,692 states left on queue. +Progress(38) at 2024-11-06 18:16:54: 4,873,381,465 states generated (29,026,698 s/min), 464,718,128 distinct states found (2,573,561 ds/min), 110,936,992 states left on queue. +Progress(38) at 2024-11-06 18:17:54: 4,902,616,179 states generated (29,234,714 s/min), 467,171,620 distinct states found (2,453,492 ds/min), 111,288,450 states left on queue. +Progress(38) at 2024-11-06 18:18:54: 4,931,808,383 states generated (29,192,204 s/min), 469,593,253 distinct states found (2,421,633 ds/min), 111,607,240 states left on queue. +Progress(38) at 2024-11-06 18:19:54: 4,961,319,800 states generated (29,511,417 s/min), 471,795,067 distinct states found (2,201,814 ds/min), 111,770,077 states left on queue. +Progress(38) at 2024-11-06 18:20:54: 4,990,051,892 states generated (28,732,092 s/min), 474,595,717 distinct states found (2,800,650 ds/min), 112,380,795 states left on queue. +Progress(38) at 2024-11-06 18:21:54: 5,019,620,389 states generated (29,568,497 s/min), 476,860,178 distinct states found (2,264,461 ds/min), 112,610,789 states left on queue. +Progress(38) at 2024-11-06 18:22:54: 5,049,176,225 states generated (29,555,836 s/min), 479,117,000 distinct states found (2,256,822 ds/min), 112,849,809 states left on queue. +Progress(38) at 2024-11-06 18:23:54: 5,078,659,511 states generated (29,483,286 s/min), 481,552,566 distinct states found (2,435,566 ds/min), 113,238,679 states left on queue. +Progress(38) at 2024-11-06 18:24:54: 5,108,186,428 states generated (29,526,917 s/min), 483,970,290 distinct states found (2,417,724 ds/min), 113,645,974 states left on queue. +Progress(38) at 2024-11-06 18:25:54: 5,137,766,496 states generated (29,580,068 s/min), 486,204,445 distinct states found (2,234,155 ds/min), 113,816,273 states left on queue. +Progress(38) at 2024-11-06 18:26:54: 5,167,429,477 states generated (29,662,981 s/min), 488,726,479 distinct states found (2,522,034 ds/min), 114,265,425 states left on queue. +Progress(38) at 2024-11-06 18:27:54: 5,197,227,715 states generated (29,798,238 s/min), 491,213,848 distinct states found (2,487,369 ds/min), 114,645,624 states left on queue. +Progress(38) at 2024-11-06 18:28:54: 5,226,883,420 states generated (29,655,705 s/min), 493,480,968 distinct states found (2,267,120 ds/min), 114,901,786 states left on queue. +Progress(38) at 2024-11-06 18:29:54: 5,256,355,905 states generated (29,472,485 s/min), 495,866,549 distinct states found (2,385,581 ds/min), 115,277,276 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 18:30:55) +Progress(38) at 2024-11-06 18:30:55: 5,286,035,252 states generated (29,679,347 s/min), 498,324,679 distinct states found (2,458,130 ds/min), 115,663,015 states left on queue. +Progress(38) at 2024-11-06 18:31:55: 5,315,467,724 states generated (29,432,472 s/min), 500,723,577 distinct states found (2,398,898 ds/min), 116,023,619 states left on queue. +Progress(38) at 2024-11-06 18:32:55: 5,344,728,453 states generated (29,260,729 s/min), 503,156,876 distinct states found (2,433,299 ds/min), 116,384,801 states left on queue. +Progress(38) at 2024-11-06 18:33:55: 5,374,055,231 states generated (29,326,778 s/min), 505,588,957 distinct states found (2,432,081 ds/min), 116,786,679 states left on queue. 
+Progress(38) at 2024-11-06 18:34:55: 5,403,566,278 states generated (29,511,047 s/min), 508,096,703 distinct states found (2,507,746 ds/min), 117,258,425 states left on queue. +Progress(38) at 2024-11-06 18:35:55: 5,432,770,932 states generated (29,204,654 s/min), 510,765,370 distinct states found (2,668,667 ds/min), 117,821,443 states left on queue. +Progress(38) at 2024-11-06 18:36:55: 5,462,325,607 states generated (29,554,675 s/min), 513,306,027 distinct states found (2,540,657 ds/min), 118,252,946 states left on queue. +Progress(38) at 2024-11-06 18:37:55: 5,491,531,381 states generated (29,205,774 s/min), 516,017,383 distinct states found (2,711,356 ds/min), 118,857,035 states left on queue. +Progress(38) at 2024-11-06 18:38:55: 5,520,744,572 states generated (29,213,191 s/min), 518,696,783 distinct states found (2,679,400 ds/min), 119,445,954 states left on queue. +Progress(38) at 2024-11-06 18:39:55: 5,549,903,819 states generated (29,159,247 s/min), 521,329,662 distinct states found (2,632,879 ds/min), 119,977,569 states left on queue. +Progress(38) at 2024-11-06 18:40:55: 5,579,474,839 states generated (29,571,020 s/min), 523,702,578 distinct states found (2,372,916 ds/min), 120,289,041 states left on queue. +Progress(38) at 2024-11-06 18:41:55: 5,608,757,550 states generated (29,282,711 s/min), 526,191,629 distinct states found (2,489,051 ds/min), 120,719,632 states left on queue. +Progress(38) at 2024-11-06 18:42:55: 5,638,085,090 states generated (29,327,540 s/min), 528,478,505 distinct states found (2,286,876 ds/min), 120,990,568 states left on queue. +Progress(38) at 2024-11-06 18:43:55: 5,667,141,833 states generated (29,056,743 s/min), 531,035,593 distinct states found (2,557,088 ds/min), 121,480,763 states left on queue. +Progress(38) at 2024-11-06 18:44:55: 5,696,139,104 states generated (28,997,271 s/min), 533,684,330 distinct states found (2,648,737 ds/min), 122,027,516 states left on queue. +Progress(38) at 2024-11-06 18:45:55: 5,724,868,902 states generated (28,729,798 s/min), 536,316,715 distinct states found (2,632,385 ds/min), 122,548,317 states left on queue. +Progress(38) at 2024-11-06 18:46:55: 5,753,438,871 states generated (28,569,969 s/min), 539,001,028 distinct states found (2,684,313 ds/min), 123,041,578 states left on queue. +Progress(38) at 2024-11-06 18:47:55: 5,782,391,778 states generated (28,952,907 s/min), 541,537,259 distinct states found (2,536,231 ds/min), 123,436,184 states left on queue. +Progress(38) at 2024-11-06 18:48:55: 5,811,823,996 states generated (29,432,218 s/min), 543,896,432 distinct states found (2,359,173 ds/min), 123,698,698 states left on queue. +Progress(38) at 2024-11-06 18:49:55: 5,841,258,941 states generated (29,434,945 s/min), 546,273,191 distinct states found (2,376,759 ds/min), 124,012,754 states left on queue. +Progress(38) at 2024-11-06 18:50:55: 5,870,667,995 states generated (29,409,054 s/min), 548,835,686 distinct states found (2,562,495 ds/min), 124,450,482 states left on queue. +Progress(38) at 2024-11-06 18:51:55: 5,900,038,718 states generated (29,370,723 s/min), 551,304,457 distinct states found (2,468,771 ds/min), 124,805,220 states left on queue. +Progress(38) at 2024-11-06 18:52:55: 5,929,442,421 states generated (29,403,703 s/min), 553,776,296 distinct states found (2,471,839 ds/min), 125,178,608 states left on queue. +Progress(38) at 2024-11-06 18:53:55: 5,958,838,496 states generated (29,396,075 s/min), 556,289,762 distinct states found (2,513,466 ds/min), 125,588,158 states left on queue. 
+Progress(38) at 2024-11-06 18:54:55: 5,988,187,325 states generated (29,348,829 s/min), 558,898,224 distinct states found (2,608,462 ds/min), 126,074,377 states left on queue. +Progress(38) at 2024-11-06 18:55:55: 6,017,546,111 states generated (29,358,786 s/min), 561,530,468 distinct states found (2,632,244 ds/min), 126,579,784 states left on queue. +Progress(38) at 2024-11-06 18:56:55: 6,046,777,143 states generated (29,231,032 s/min), 564,182,546 distinct states found (2,652,078 ds/min), 127,037,883 states left on queue. +Progress(39) at 2024-11-06 18:57:55: 6,076,111,479 states generated (29,334,336 s/min), 566,509,898 distinct states found (2,327,352 ds/min), 127,319,036 states left on queue. +Progress(39) at 2024-11-06 18:58:55: 6,105,215,668 states generated (29,104,189 s/min), 569,000,954 distinct states found (2,491,056 ds/min), 127,724,185 states left on queue. +Progress(39) at 2024-11-06 18:59:55: 6,134,619,650 states generated (29,403,982 s/min), 571,444,199 distinct states found (2,443,245 ds/min), 128,083,849 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 19:00:55) +Progress(39) at 2024-11-06 19:00:55: 6,164,303,226 states generated (29,683,576 s/min), 574,046,920 distinct states found (2,602,721 ds/min), 128,537,330 states left on queue. +Progress(39) at 2024-11-06 19:01:55: 6,193,710,515 states generated (29,407,289 s/min), 576,294,161 distinct states found (2,247,241 ds/min), 128,749,186 states left on queue. +Progress(39) at 2024-11-06 19:02:55: 6,223,050,437 states generated (29,339,922 s/min), 578,840,811 distinct states found (2,546,650 ds/min), 129,198,375 states left on queue. +Progress(39) at 2024-11-06 19:03:55: 6,252,273,339 states generated (29,222,902 s/min), 581,530,481 distinct states found (2,689,670 ds/min), 129,745,195 states left on queue. +Progress(39) at 2024-11-06 19:04:55: 6,281,535,213 states generated (29,261,874 s/min), 584,206,969 distinct states found (2,676,488 ds/min), 130,306,182 states left on queue. +Progress(39) at 2024-11-06 19:05:55: 6,310,569,147 states generated (29,033,934 s/min), 587,031,959 distinct states found (2,824,990 ds/min), 130,922,629 states left on queue. +Progress(39) at 2024-11-06 19:06:55: 6,339,951,741 states generated (29,382,594 s/min), 589,709,668 distinct states found (2,677,709 ds/min), 131,483,555 states left on queue. +Progress(39) at 2024-11-06 19:07:55: 6,369,354,481 states generated (29,402,740 s/min), 591,964,654 distinct states found (2,254,986 ds/min), 131,688,532 states left on queue. +Progress(39) at 2024-11-06 19:08:55: 6,398,254,591 states generated (28,900,110 s/min), 594,604,924 distinct states found (2,640,270 ds/min), 132,195,069 states left on queue. +Progress(39) at 2024-11-06 19:09:55: 6,427,422,756 states generated (29,168,165 s/min), 597,059,083 distinct states found (2,454,159 ds/min), 132,571,626 states left on queue. +Progress(39) at 2024-11-06 19:10:55: 6,456,469,721 states generated (29,046,965 s/min), 599,400,317 distinct states found (2,341,234 ds/min), 132,826,474 states left on queue. +Progress(39) at 2024-11-06 19:11:55: 6,485,733,442 states generated (29,263,721 s/min), 602,040,336 distinct states found (2,640,019 ds/min), 133,286,664 states left on queue. +Progress(39) at 2024-11-06 19:12:55: 6,515,001,998 states generated (29,268,556 s/min), 604,003,958 distinct states found (1,963,622 ds/min), 133,255,252 states left on queue. 
+Progress(39) at 2024-11-06 19:13:55: 6,544,172,146 states generated (29,170,148 s/min), 606,473,164 distinct states found (2,469,206 ds/min), 133,627,323 states left on queue. +Progress(39) at 2024-11-06 19:14:55: 6,572,975,355 states generated (28,803,209 s/min), 609,043,606 distinct states found (2,570,442 ds/min), 134,023,262 states left on queue. +Progress(39) at 2024-11-06 19:15:55: 6,602,534,934 states generated (29,559,579 s/min), 611,212,652 distinct states found (2,169,046 ds/min), 134,205,070 states left on queue. +Progress(39) at 2024-11-06 19:16:55: 6,632,044,851 states generated (29,509,917 s/min), 613,377,378 distinct states found (2,164,726 ds/min), 134,360,577 states left on queue. +Progress(39) at 2024-11-06 19:17:55: 6,661,465,356 states generated (29,420,505 s/min), 615,729,605 distinct states found (2,352,227 ds/min), 134,679,148 states left on queue. +Progress(39) at 2024-11-06 19:18:55: 6,690,848,776 states generated (29,383,420 s/min), 618,034,126 distinct states found (2,304,521 ds/min), 134,989,999 states left on queue. +Progress(39) at 2024-11-06 19:19:55: 6,720,362,641 states generated (29,513,865 s/min), 620,264,990 distinct states found (2,230,864 ds/min), 135,213,527 states left on queue. +Progress(39) at 2024-11-06 19:20:55: 6,749,995,972 states generated (29,633,331 s/min), 622,424,423 distinct states found (2,159,433 ds/min), 135,336,269 states left on queue. +Progress(39) at 2024-11-06 19:21:55: 6,779,641,479 states generated (29,645,507 s/min), 624,953,002 distinct states found (2,528,579 ds/min), 135,781,717 states left on queue. +Progress(39) at 2024-11-06 19:22:55: 6,809,496,805 states generated (29,855,326 s/min), 627,297,563 distinct states found (2,344,561 ds/min), 136,040,988 states left on queue. +Progress(39) at 2024-11-06 19:23:55: 6,839,096,708 states generated (29,599,903 s/min), 629,464,688 distinct states found (2,167,125 ds/min), 136,210,971 states left on queue. +Progress(39) at 2024-11-06 19:24:55: 6,868,614,311 states generated (29,517,603 s/min), 631,704,627 distinct states found (2,239,939 ds/min), 136,469,731 states left on queue. +Progress(39) at 2024-11-06 19:25:55: 6,897,932,930 states generated (29,318,619 s/min), 633,961,042 distinct states found (2,256,415 ds/min), 136,714,912 states left on queue. +Progress(39) at 2024-11-06 19:26:55: 6,927,200,602 states generated (29,267,672 s/min), 636,414,800 distinct states found (2,453,758 ds/min), 137,101,547 states left on queue. +Progress(39) at 2024-11-06 19:27:55: 6,956,755,074 states generated (29,554,472 s/min), 638,616,489 distinct states found (2,201,689 ds/min), 137,285,238 states left on queue. +Progress(39) at 2024-11-06 19:28:55: 6,985,926,285 states generated (29,171,211 s/min), 640,970,274 distinct states found (2,353,785 ds/min), 137,592,586 states left on queue. +Progress(39) at 2024-11-06 19:29:55: 7,015,240,294 states generated (29,314,009 s/min), 643,310,280 distinct states found (2,340,006 ds/min), 137,914,322 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 19:30:56) +Progress(39) at 2024-11-06 19:30:56: 7,045,112,039 states generated (29,871,745 s/min), 645,650,251 distinct states found (2,339,971 ds/min), 138,248,533 states left on queue. +Progress(39) at 2024-11-06 19:31:56: 7,074,347,122 states generated (29,235,083 s/min), 648,286,341 distinct states found (2,636,090 ds/min), 138,800,606 states left on queue. 
+Progress(39) at 2024-11-06 19:32:56: 7,103,701,427 states generated (29,354,305 s/min), 650,776,754 distinct states found (2,490,413 ds/min), 139,200,935 states left on queue. +Progress(39) at 2024-11-06 19:33:56: 7,133,125,574 states generated (29,424,147 s/min), 653,222,778 distinct states found (2,446,024 ds/min), 139,553,972 states left on queue. +Progress(39) at 2024-11-06 19:34:56: 7,162,393,954 states generated (29,268,380 s/min), 655,812,815 distinct states found (2,590,037 ds/min), 140,051,736 states left on queue. +Progress(39) at 2024-11-06 19:35:56: 7,191,614,309 states generated (29,220,355 s/min), 658,388,779 distinct states found (2,575,964 ds/min), 140,550,430 states left on queue. +Progress(39) at 2024-11-06 19:36:56: 7,220,841,977 states generated (29,227,668 s/min), 660,885,901 distinct states found (2,497,122 ds/min), 140,973,038 states left on queue. +Progress(39) at 2024-11-06 19:37:56: 7,250,020,241 states generated (29,178,264 s/min), 663,335,701 distinct states found (2,449,800 ds/min), 141,327,800 states left on queue. +Progress(39) at 2024-11-06 19:38:56: 7,279,545,923 states generated (29,525,682 s/min), 665,706,252 distinct states found (2,370,551 ds/min), 141,666,628 states left on queue. +Progress(39) at 2024-11-06 19:39:56: 7,308,806,585 states generated (29,260,662 s/min), 668,059,763 distinct states found (2,353,511 ds/min), 141,985,139 states left on queue. +Progress(39) at 2024-11-06 19:40:56: 7,338,028,888 states generated (29,222,303 s/min), 670,241,848 distinct states found (2,182,085 ds/min), 142,169,842 states left on queue. +Progress(39) at 2024-11-06 19:41:56: 7,367,241,753 states generated (29,212,865 s/min), 672,613,255 distinct states found (2,371,407 ds/min), 142,507,724 states left on queue. +Progress(39) at 2024-11-06 19:42:56: 7,396,269,434 states generated (29,027,681 s/min), 675,112,517 distinct states found (2,499,262 ds/min), 142,941,967 states left on queue. +Progress(39) at 2024-11-06 19:43:56: 7,425,237,701 states generated (28,968,267 s/min), 677,646,850 distinct states found (2,534,333 ds/min), 143,388,301 states left on queue. +Progress(39) at 2024-11-06 19:44:56: 7,453,929,312 states generated (28,691,611 s/min), 680,183,486 distinct states found (2,536,636 ds/min), 143,823,998 states left on queue. +Progress(39) at 2024-11-06 19:45:56: 7,482,605,282 states generated (28,675,970 s/min), 682,751,269 distinct states found (2,567,783 ds/min), 144,211,694 states left on queue. +Progress(39) at 2024-11-06 19:46:56: 7,511,402,194 states generated (28,796,912 s/min), 685,177,338 distinct states found (2,426,069 ds/min), 144,502,576 states left on queue. +Progress(39) at 2024-11-06 19:47:56: 7,540,667,315 states generated (29,265,121 s/min), 687,470,422 distinct states found (2,293,084 ds/min), 144,717,485 states left on queue. +Progress(39) at 2024-11-06 19:48:56: 7,570,065,371 states generated (29,398,056 s/min), 689,724,172 distinct states found (2,253,750 ds/min), 144,895,541 states left on queue. +Progress(39) at 2024-11-06 19:49:56: 7,599,596,791 states generated (29,531,420 s/min), 692,064,101 distinct states found (2,339,929 ds/min), 145,171,911 states left on queue. +Progress(39) at 2024-11-06 19:50:56: 7,629,011,363 states generated (29,414,572 s/min), 694,540,161 distinct states found (2,476,060 ds/min), 145,540,423 states left on queue. +Progress(39) at 2024-11-06 19:51:56: 7,658,453,965 states generated (29,442,602 s/min), 696,912,122 distinct states found (2,371,961 ds/min), 145,809,567 states left on queue. 
+Progress(39) at 2024-11-06 19:52:56: 7,687,913,137 states generated (29,459,172 s/min), 699,240,630 distinct states found (2,328,508 ds/min), 146,098,273 states left on queue. +Progress(39) at 2024-11-06 19:53:56: 7,717,161,254 states generated (29,248,117 s/min), 701,789,915 distinct states found (2,549,285 ds/min), 146,502,121 states left on queue. +Progress(39) at 2024-11-06 19:54:56: 7,746,587,948 states generated (29,426,694 s/min), 704,037,014 distinct states found (2,247,099 ds/min), 146,684,369 states left on queue. +Progress(39) at 2024-11-06 19:55:56: 7,775,767,241 states generated (29,179,293 s/min), 706,750,225 distinct states found (2,713,211 ds/min), 147,270,858 states left on queue. +Progress(39) at 2024-11-06 19:56:56: 7,805,143,313 states generated (29,376,072 s/min), 709,214,940 distinct states found (2,464,715 ds/min), 147,627,166 states left on queue. +Progress(39) at 2024-11-06 19:57:56: 7,834,403,478 states generated (29,260,165 s/min), 711,759,633 distinct states found (2,544,693 ds/min), 147,996,842 states left on queue. +Progress(40) at 2024-11-06 19:58:56: 7,863,785,909 states generated (29,382,431 s/min), 713,915,903 distinct states found (2,156,270 ds/min), 148,107,480 states left on queue. +Progress(40) at 2024-11-06 19:59:56: 7,892,661,923 states generated (28,876,014 s/min), 716,529,052 distinct states found (2,613,149 ds/min), 148,615,346 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 20:00:57) +Progress(40) at 2024-11-06 20:00:57: 7,922,354,868 states generated (29,692,945 s/min), 718,724,840 distinct states found (2,195,788 ds/min), 148,760,464 states left on queue. +Progress(40) at 2024-11-06 20:01:57: 7,951,821,345 states generated (29,466,477 s/min), 721,199,790 distinct states found (2,474,950 ds/min), 149,133,458 states left on queue. +Progress(40) at 2024-11-06 20:02:57: 7,981,212,562 states generated (29,391,217 s/min), 723,637,084 distinct states found (2,437,294 ds/min), 149,453,388 states left on queue. +Progress(40) at 2024-11-06 20:03:57: 8,010,639,344 states generated (29,426,782 s/min), 725,776,597 distinct states found (2,139,513 ds/min), 149,580,205 states left on queue. +Progress(40) at 2024-11-06 20:04:57: 8,039,970,078 states generated (29,330,734 s/min), 728,145,896 distinct states found (2,369,299 ds/min), 149,873,787 states left on queue. +Progress(40) at 2024-11-06 20:05:57: 8,069,221,501 states generated (29,251,423 s/min), 730,835,980 distinct states found (2,690,084 ds/min), 150,431,663 states left on queue. +Progress(40) at 2024-11-06 20:06:57: 8,098,568,645 states generated (29,347,144 s/min), 733,266,238 distinct states found (2,430,258 ds/min), 150,772,190 states left on queue. +Progress(40) at 2024-11-06 20:07:57: 8,127,646,970 states generated (29,078,325 s/min), 736,001,441 distinct states found (2,735,203 ds/min), 151,368,297 states left on queue. +Progress(40) at 2024-11-06 20:08:57: 8,156,755,007 states generated (29,108,037 s/min), 738,759,675 distinct states found (2,758,234 ds/min), 151,912,929 states left on queue. +Progress(40) at 2024-11-06 20:09:57: 8,186,234,810 states generated (29,479,803 s/min), 741,336,146 distinct states found (2,576,471 ds/min), 152,376,828 states left on queue. +Progress(40) at 2024-11-06 20:10:57: 8,215,641,994 states generated (29,407,184 s/min), 743,647,353 distinct states found (2,311,207 ds/min), 152,617,899 states left on queue. 
+Progress(40) at 2024-11-06 20:11:57: 8,244,746,445 states generated (29,104,451 s/min), 746,080,007 distinct states found (2,432,654 ds/min), 152,939,104 states left on queue. +Progress(40) at 2024-11-06 20:12:57: 8,273,514,095 states generated (28,767,650 s/min), 748,726,701 distinct states found (2,646,694 ds/min), 153,445,645 states left on queue. +Progress(40) at 2024-11-06 20:13:57: 8,302,647,011 states generated (29,132,916 s/min), 751,041,420 distinct states found (2,314,719 ds/min), 153,711,631 states left on queue. +Progress(40) at 2024-11-06 20:14:57: 8,331,785,512 states generated (29,138,501 s/min), 753,262,324 distinct states found (2,220,904 ds/min), 153,861,206 states left on queue. +Progress(40) at 2024-11-06 20:15:57: 8,361,058,813 states generated (29,273,301 s/min), 755,881,803 distinct states found (2,619,479 ds/min), 154,293,451 states left on queue. +Progress(40) at 2024-11-06 20:16:57: 8,390,323,842 states generated (29,265,029 s/min), 757,769,813 distinct states found (1,888,010 ds/min), 154,184,183 states left on queue. +Progress(40) at 2024-11-06 20:17:57: 8,419,579,524 states generated (29,255,682 s/min), 760,009,795 distinct states found (2,239,982 ds/min), 154,382,656 states left on queue. +Progress(40) at 2024-11-06 20:18:57: 8,448,394,343 states generated (28,814,819 s/min), 762,597,225 distinct states found (2,587,430 ds/min), 154,795,314 states left on queue. +Progress(40) at 2024-11-06 20:19:57: 8,477,530,142 states generated (29,135,799 s/min), 764,903,184 distinct states found (2,305,959 ds/min), 154,997,361 states left on queue. +Progress(40) at 2024-11-06 20:20:57: 8,507,035,930 states generated (29,505,788 s/min), 766,887,142 distinct states found (1,983,958 ds/min), 155,034,831 states left on queue. +Progress(40) at 2024-11-06 20:21:57: 8,536,505,703 states generated (29,469,773 s/min), 769,048,483 distinct states found (2,161,341 ds/min), 155,183,742 states left on queue. +Progress(40) at 2024-11-06 20:22:57: 8,565,867,584 states generated (29,361,881 s/min), 771,258,076 distinct states found (2,209,593 ds/min), 155,385,262 states left on queue. +Progress(40) at 2024-11-06 20:23:57: 8,595,185,764 states generated (29,318,180 s/min), 773,454,985 distinct states found (2,196,909 ds/min), 155,614,111 states left on queue. +Progress(40) at 2024-11-06 20:24:57: 8,624,496,269 states generated (29,310,505 s/min), 775,619,630 distinct states found (2,164,645 ds/min), 155,798,174 states left on queue. +Progress(40) at 2024-11-06 20:25:57: 8,654,080,073 states generated (29,583,804 s/min), 777,637,410 distinct states found (2,017,780 ds/min), 155,782,045 states left on queue. +Progress(40) at 2024-11-06 20:26:57: 8,683,722,009 states generated (29,641,936 s/min), 779,940,399 distinct states found (2,302,989 ds/min), 156,073,330 states left on queue. +Progress(40) at 2024-11-06 20:27:57: 8,713,410,725 states generated (29,688,716 s/min), 782,406,987 distinct states found (2,466,588 ds/min), 156,445,902 states left on queue. +Progress(40) at 2024-11-06 20:28:57: 8,743,158,002 states generated (29,747,277 s/min), 784,542,609 distinct states found (2,135,622 ds/min), 156,539,841 states left on queue. +Progress(40) at 2024-11-06 20:29:57: 8,772,688,809 states generated (29,530,807 s/min), 786,583,608 distinct states found (2,040,999 ds/min), 156,630,041 states left on queue. 
+Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 20:30:57) +Progress(40) at 2024-11-06 20:30:57: 8,802,299,219 states generated (29,610,410 s/min), 788,709,007 distinct states found (2,125,399 ds/min), 156,780,966 states left on queue. +Progress(40) at 2024-11-06 20:31:57: 8,831,545,663 states generated (29,246,444 s/min), 790,874,634 distinct states found (2,165,627 ds/min), 156,943,688 states left on queue. +Progress(40) at 2024-11-06 20:32:57: 8,860,742,526 states generated (29,196,863 s/min), 793,218,612 distinct states found (2,343,978 ds/min), 157,247,738 states left on queue. +Progress(40) at 2024-11-06 20:33:57: 8,890,145,689 states generated (29,403,163 s/min), 795,347,746 distinct states found (2,129,134 ds/min), 157,376,715 states left on queue. +Progress(40) at 2024-11-06 20:34:57: 8,919,277,440 states generated (29,131,751 s/min), 797,557,991 distinct states found (2,210,245 ds/min), 157,566,508 states left on queue. +Progress(40) at 2024-11-06 20:35:57: 8,948,368,355 states generated (29,090,915 s/min), 799,870,441 distinct states found (2,312,450 ds/min), 157,825,337 states left on queue. +Progress(40) at 2024-11-06 20:36:57: 8,977,811,769 states generated (29,443,414 s/min), 801,992,418 distinct states found (2,121,977 ds/min), 158,015,008 states left on queue. +Progress(40) at 2024-11-06 20:37:57: 9,007,285,675 states generated (29,473,906 s/min), 804,250,024 distinct states found (2,257,606 ds/min), 158,295,507 states left on queue. +Progress(40) at 2024-11-06 20:38:57: 9,036,450,953 states generated (29,165,278 s/min), 806,795,860 distinct states found (2,545,836 ds/min), 158,767,907 states left on queue. +Progress(40) at 2024-11-06 20:39:57: 9,065,704,268 states generated (29,253,315 s/min), 809,198,438 distinct states found (2,402,578 ds/min), 159,105,121 states left on queue. +Progress(40) at 2024-11-06 20:40:57: 9,095,165,427 states generated (29,461,159 s/min), 811,512,584 distinct states found (2,314,146 ds/min), 159,345,117 states left on queue. +Progress(40) at 2024-11-06 20:41:57: 9,124,541,297 states generated (29,375,870 s/min), 813,905,920 distinct states found (2,393,336 ds/min), 159,672,325 states left on queue. +Progress(40) at 2024-11-06 20:42:57: 9,153,712,591 states generated (29,171,294 s/min), 816,392,570 distinct states found (2,486,650 ds/min), 160,082,547 states left on queue. +Progress(40) at 2024-11-06 20:43:57: 9,182,920,866 states generated (29,208,275 s/min), 818,845,538 distinct states found (2,452,968 ds/min), 160,476,056 states left on queue. +Progress(40) at 2024-11-06 20:44:57: 9,212,093,614 states generated (29,172,748 s/min), 821,212,595 distinct states found (2,367,057 ds/min), 160,787,698 states left on queue. +Progress(40) at 2024-11-06 20:45:57: 9,241,177,362 states generated (29,083,748 s/min), 823,731,111 distinct states found (2,518,516 ds/min), 161,227,975 states left on queue. +Progress(40) at 2024-11-06 20:46:57: 9,270,666,448 states generated (29,489,086 s/min), 825,877,262 distinct states found (2,146,151 ds/min), 161,339,209 states left on queue. +Progress(40) at 2024-11-06 20:47:57: 9,299,985,513 states generated (29,319,065 s/min), 828,195,512 distinct states found (2,318,250 ds/min), 161,644,069 states left on queue. +Progress(40) at 2024-11-06 20:48:57: 9,329,155,005 states generated (29,169,492 s/min), 830,386,518 distinct states found (2,191,006 ds/min), 161,807,802 states left on queue. 
+Progress(40) at 2024-11-06 20:49:57: 9,358,433,771 states generated (29,278,766 s/min), 832,419,931 distinct states found (2,033,413 ds/min), 161,882,018 states left on queue. +Progress(40) at 2024-11-06 20:50:57: 9,387,665,287 states generated (29,231,516 s/min), 834,751,267 distinct states found (2,331,336 ds/min), 162,183,217 states left on queue. +Progress(40) at 2024-11-06 20:51:57: 9,416,697,647 states generated (29,032,360 s/min), 837,127,657 distinct states found (2,376,390 ds/min), 162,511,558 states left on queue. +Progress(40) at 2024-11-06 20:52:57: 9,445,747,666 states generated (29,050,019 s/min), 839,556,372 distinct states found (2,428,715 ds/min), 162,873,418 states left on queue. +Progress(40) at 2024-11-06 20:53:57: 9,474,599,613 states generated (28,851,947 s/min), 841,985,780 distinct states found (2,429,408 ds/min), 163,231,531 states left on queue. +Progress(40) at 2024-11-06 20:54:57: 9,503,408,525 states generated (28,808,912 s/min), 844,368,680 distinct states found (2,382,900 ds/min), 163,533,407 states left on queue. +Progress(40) at 2024-11-06 20:55:57: 9,532,128,492 states generated (28,719,967 s/min), 846,804,519 distinct states found (2,435,839 ds/min), 163,787,695 states left on queue. +Progress(40) at 2024-11-06 20:56:57: 9,560,935,598 states generated (28,807,106 s/min), 849,075,143 distinct states found (2,270,624 ds/min), 163,946,240 states left on queue. +Progress(40) at 2024-11-06 20:57:57: 9,590,127,374 states generated (29,191,776 s/min), 851,260,378 distinct states found (2,185,235 ds/min), 164,077,372 states left on queue. +Progress(40) at 2024-11-06 20:58:57: 9,619,514,341 states generated (29,386,967 s/min), 853,352,738 distinct states found (2,092,360 ds/min), 164,118,186 states left on queue. +Progress(40) at 2024-11-06 20:59:57: 9,648,985,302 states generated (29,470,961 s/min), 855,543,408 distinct states found (2,190,670 ds/min), 164,279,076 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 21:00:58) +Progress(40) at 2024-11-06 21:00:58: 9,678,677,722 states generated (29,692,420 s/min), 857,894,775 distinct states found (2,351,367 ds/min), 164,516,395 states left on queue. +Progress(40) at 2024-11-06 21:01:58: 9,708,095,509 states generated (29,417,787 s/min), 860,383,155 distinct states found (2,488,380 ds/min), 164,898,153 states left on queue. +Progress(40) at 2024-11-06 21:02:58: 9,737,488,378 states generated (29,392,869 s/min), 862,497,194 distinct states found (2,114,039 ds/min), 164,966,010 states left on queue. +Progress(40) at 2024-11-06 21:03:58: 9,766,895,552 states generated (29,407,174 s/min), 864,819,525 distinct states found (2,322,331 ds/min), 165,232,701 states left on queue. +Progress(40) at 2024-11-06 21:04:58: 9,796,208,300 states generated (29,312,748 s/min), 867,276,841 distinct states found (2,457,316 ds/min), 165,568,613 states left on queue. +Progress(40) at 2024-11-06 21:05:58: 9,825,603,726 states generated (29,395,426 s/min), 869,434,526 distinct states found (2,157,685 ds/min), 165,685,610 states left on queue. +Progress(40) at 2024-11-06 21:06:58: 9,854,789,772 states generated (29,186,046 s/min), 871,934,034 distinct states found (2,499,508 ds/min), 166,084,000 states left on queue. +Progress(40) at 2024-11-06 21:07:58: 9,884,028,390 states generated (29,238,618 s/min), 874,443,659 distinct states found (2,509,625 ds/min), 166,483,652 states left on queue. 
+Progress(40) at 2024-11-06 21:08:58: 9,913,377,669 states generated (29,349,279 s/min), 876,803,913 distinct states found (2,360,254 ds/min), 166,740,702 states left on queue. +Progress(40) at 2024-11-06 21:09:58: 9,942,721,749 states generated (29,344,080 s/min), 879,187,270 distinct states found (2,383,357 ds/min), 166,953,562 states left on queue. +Progress(41) at 2024-11-06 21:10:58: 9,972,078,704 states generated (29,356,955 s/min), 881,233,361 distinct states found (2,046,091 ds/min), 166,999,841 states left on queue. +Progress(41) at 2024-11-06 21:11:58: 10,000,914,792 states generated (28,836,088 s/min), 883,811,441 distinct states found (2,578,080 ds/min), 167,466,583 states left on queue. +Progress(41) at 2024-11-06 21:12:58: 10,030,210,434 states generated (29,295,642 s/min), 885,899,950 distinct states found (2,088,509 ds/min), 167,531,826 states left on queue. +Progress(41) at 2024-11-06 21:13:58: 10,059,587,070 states generated (29,376,636 s/min), 888,188,669 distinct states found (2,288,719 ds/min), 167,753,242 states left on queue. +Progress(41) at 2024-11-06 21:14:58: 10,089,078,901 states generated (29,491,831 s/min), 890,649,997 distinct states found (2,461,328 ds/min), 168,098,890 states left on queue. +Progress(41) at 2024-11-06 21:15:58: 10,118,348,352 states generated (29,269,451 s/min), 892,695,892 distinct states found (2,045,895 ds/min), 168,141,532 states left on queue. +Progress(41) at 2024-11-06 21:16:58: 10,147,644,676 states generated (29,296,324 s/min), 894,823,997 distinct states found (2,128,105 ds/min), 168,231,032 states left on queue. +Progress(41) at 2024-11-06 21:17:58: 10,176,967,773 states generated (29,323,097 s/min), 897,225,523 distinct states found (2,401,526 ds/min), 168,555,740 states left on queue. +Progress(41) at 2024-11-06 21:18:58: 10,206,275,174 states generated (29,307,401 s/min), 899,814,626 distinct states found (2,589,103 ds/min), 169,020,971 states left on queue. +Progress(41) at 2024-11-06 21:19:58: 10,235,593,993 states generated (29,318,819 s/min), 902,141,356 distinct states found (2,326,730 ds/min), 169,267,251 states left on queue. +Progress(41) at 2024-11-06 21:20:58: 10,264,799,049 states generated (29,205,056 s/min), 904,746,333 distinct states found (2,604,977 ds/min), 169,758,459 states left on queue. +Progress(41) at 2024-11-06 21:21:58: 10,293,910,586 states generated (29,111,537 s/min), 907,433,182 distinct states found (2,686,849 ds/min), 170,277,176 states left on queue. +Progress(41) at 2024-11-06 21:22:58: 10,323,190,750 states generated (29,280,164 s/min), 910,052,108 distinct states found (2,618,926 ds/min), 170,695,212 states left on queue. +Progress(41) at 2024-11-06 21:23:58: 10,352,580,182 states generated (29,389,432 s/min), 912,516,064 distinct states found (2,463,956 ds/min), 171,083,771 states left on queue. +Progress(41) at 2024-11-06 21:24:58: 10,381,951,479 states generated (29,371,297 s/min), 914,781,443 distinct states found (2,265,379 ds/min), 171,281,545 states left on queue. +Progress(41) at 2024-11-06 21:25:58: 10,411,026,945 states generated (29,075,466 s/min), 917,078,052 distinct states found (2,296,609 ds/min), 171,498,613 states left on queue. +Progress(41) at 2024-11-06 21:26:58: 10,439,904,441 states generated (28,877,496 s/min), 919,547,808 distinct states found (2,469,756 ds/min), 171,860,589 states left on queue. 
+Progress(41) at 2024-11-06 21:27:58: 10,469,008,600 states generated (29,104,159 s/min), 921,912,547 distinct states found (2,364,739 ds/min), 172,121,551 states left on queue. +Progress(41) at 2024-11-06 21:28:58: 10,497,834,986 states generated (28,826,386 s/min), 924,235,840 distinct states found (2,323,293 ds/min), 172,353,661 states left on queue. +Progress(41) at 2024-11-06 21:29:58: 10,527,064,696 states generated (29,229,710 s/min), 926,456,744 distinct states found (2,220,904 ds/min), 172,508,439 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 21:30:59) +Progress(41) at 2024-11-06 21:30:59: 10,556,579,142 states generated (29,514,446 s/min), 928,988,872 distinct states found (2,532,128 ds/min), 172,833,183 states left on queue. +Progress(41) at 2024-11-06 21:31:59: 10,585,719,909 states generated (29,140,767 s/min), 930,745,149 distinct states found (1,756,277 ds/min), 172,622,496 states left on queue. +Progress(41) at 2024-11-06 21:32:59: 10,614,881,115 states generated (29,161,206 s/min), 932,948,083 distinct states found (2,202,934 ds/min), 172,792,818 states left on queue. +Progress(41) at 2024-11-06 21:33:59: 10,643,693,909 states generated (28,812,794 s/min), 935,441,862 distinct states found (2,493,779 ds/min), 173,119,721 states left on queue. +Progress(41) at 2024-11-06 21:34:59: 10,672,671,166 states generated (28,977,257 s/min), 937,653,961 distinct states found (2,212,099 ds/min), 173,216,843 states left on queue. +Progress(41) at 2024-11-06 21:35:59: 10,702,072,440 states generated (29,401,274 s/min), 939,638,920 distinct states found (1,984,959 ds/min), 173,254,076 states left on queue. +Progress(41) at 2024-11-06 21:36:59: 10,731,415,292 states generated (29,342,852 s/min), 941,583,653 distinct states found (1,944,733 ds/min), 173,229,968 states left on queue. +Progress(41) at 2024-11-06 21:37:59: 10,760,802,656 states generated (29,387,364 s/min), 943,770,610 distinct states found (2,186,957 ds/min), 173,412,799 states left on queue. +Progress(41) at 2024-11-06 21:38:59: 10,789,961,996 states generated (29,159,340 s/min), 945,790,519 distinct states found (2,019,909 ds/min), 173,482,204 states left on queue. +Progress(41) at 2024-11-06 21:39:59: 10,819,303,972 states generated (29,341,976 s/min), 947,902,156 distinct states found (2,111,637 ds/min), 173,640,941 states left on queue. +Progress(41) at 2024-11-06 21:40:59: 10,848,636,471 states generated (29,332,499 s/min), 949,908,145 distinct states found (2,005,989 ds/min), 173,684,074 states left on queue. +Progress(41) at 2024-11-06 21:41:59: 10,878,207,345 states generated (29,570,874 s/min), 951,870,784 distinct states found (1,962,639 ds/min), 173,648,255 states left on queue. +Progress(41) at 2024-11-06 21:42:59: 10,907,777,091 states generated (29,569,746 s/min), 954,123,321 distinct states found (2,252,537 ds/min), 173,881,583 states left on queue. +Progress(41) at 2024-11-06 21:43:59: 10,937,383,465 states generated (29,606,374 s/min), 956,486,701 distinct states found (2,363,380 ds/min), 174,173,694 states left on queue. +Progress(41) at 2024-11-06 21:44:59: 10,967,070,713 states generated (29,687,248 s/min), 958,539,717 distinct states found (2,053,016 ds/min), 174,194,592 states left on queue. +Progress(41) at 2024-11-06 21:45:59: 10,996,524,132 states generated (29,453,419 s/min), 960,439,766 distinct states found (1,900,049 ds/min), 174,165,777 states left on queue. 
+Progress(41) at 2024-11-06 21:46:59: 11,025,919,452 states generated (29,395,320 s/min), 962,518,661 distinct states found (2,078,895 ds/min), 174,284,642 states left on queue. +Progress(41) at 2024-11-06 21:47:59: 11,055,087,136 states generated (29,167,684 s/min), 964,440,130 distinct states found (1,921,469 ds/min), 174,253,951 states left on queue. +Progress(41) at 2024-11-06 21:48:59: 11,084,346,164 states generated (29,259,028 s/min), 966,652,841 distinct states found (2,212,711 ds/min), 174,452,762 states left on queue. +Progress(41) at 2024-11-06 21:49:59: 11,113,503,996 states generated (29,157,832 s/min), 968,786,590 distinct states found (2,133,749 ds/min), 174,578,147 states left on queue. +Progress(41) at 2024-11-06 21:50:59: 11,142,862,327 states generated (29,358,331 s/min), 970,780,918 distinct states found (1,994,328 ds/min), 174,585,050 states left on queue. +Progress(41) at 2024-11-06 21:51:59: 11,171,907,560 states generated (29,045,233 s/min), 972,924,432 distinct states found (2,143,514 ds/min), 174,718,189 states left on queue. +Progress(41) at 2024-11-06 21:52:59: 11,201,055,602 states generated (29,148,042 s/min), 975,106,131 distinct states found (2,181,699 ds/min), 174,874,035 states left on queue. +Progress(41) at 2024-11-06 21:53:59: 11,230,576,268 states generated (29,520,666 s/min), 977,176,048 distinct states found (2,069,917 ds/min), 175,042,666 states left on queue. +Progress(41) at 2024-11-06 21:54:59: 11,259,928,257 states generated (29,351,989 s/min), 979,337,351 distinct states found (2,161,303 ds/min), 175,248,665 states left on queue. +Progress(41) at 2024-11-06 21:55:59: 11,289,190,366 states generated (29,262,109 s/min), 981,837,130 distinct states found (2,499,779 ds/min), 175,680,736 states left on queue. +Progress(41) at 2024-11-06 21:56:59: 11,318,399,828 states generated (29,209,462 s/min), 984,112,195 distinct states found (2,275,065 ds/min), 175,913,580 states left on queue. +Progress(41) at 2024-11-06 21:57:59: 11,347,862,845 states generated (29,463,017 s/min), 986,368,069 distinct states found (2,255,874 ds/min), 176,126,523 states left on queue. +Progress(41) at 2024-11-06 21:58:59: 11,377,318,937 states generated (29,456,092 s/min), 988,548,686 distinct states found (2,180,617 ds/min), 176,253,552 states left on queue. +Progress(41) at 2024-11-06 21:59:59: 11,406,551,913 states generated (29,232,976 s/min), 990,875,071 distinct states found (2,326,385 ds/min), 176,528,465 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 22:00:59) +Progress(41) at 2024-11-06 22:00:59: 11,436,006,666 states generated (29,454,753 s/min), 993,234,999 distinct states found (2,359,928 ds/min), 176,816,755 states left on queue. +Progress(41) at 2024-11-06 22:01:59: 11,465,207,151 states generated (29,200,485 s/min), 995,557,179 distinct states found (2,322,180 ds/min), 177,094,397 states left on queue. +Progress(41) at 2024-11-06 22:02:59: 11,494,298,575 states generated (29,091,424 s/min), 997,927,812 distinct states found (2,370,633 ds/min), 177,411,890 states left on queue. +Progress(41) at 2024-11-06 22:03:59: 11,523,576,632 states generated (29,278,057 s/min), 1,000,196,030 distinct states found (2,268,218 ds/min), 177,640,656 states left on queue. +Progress(41) at 2024-11-06 22:04:59: 11,552,734,483 states generated (29,157,851 s/min), 1,002,452,277 distinct states found (2,256,247 ds/min), 177,827,247 states left on queue. 
+Progress(41) at 2024-11-06 22:05:59: 11,582,200,298 states generated (29,465,815 s/min), 1,004,593,818 distinct states found (2,141,541 ds/min), 177,983,707 states left on queue. +Progress(41) at 2024-11-06 22:06:59: 11,611,484,149 states generated (29,283,851 s/min), 1,006,774,383 distinct states found (2,180,565 ds/min), 178,161,577 states left on queue. +Progress(41) at 2024-11-06 22:07:59: 11,640,449,232 states generated (28,965,083 s/min), 1,008,870,356 distinct states found (2,095,973 ds/min), 178,245,657 states left on queue. +Progress(41) at 2024-11-06 22:08:59: 11,669,695,402 states generated (29,246,170 s/min), 1,010,743,262 distinct states found (1,872,906 ds/min), 178,199,630 states left on queue. +Progress(41) at 2024-11-06 22:09:59: 11,698,855,657 states generated (29,160,255 s/min), 1,012,993,163 distinct states found (2,249,901 ds/min), 178,433,806 states left on queue. +Progress(41) at 2024-11-06 22:10:59: 11,727,873,536 states generated (29,017,879 s/min), 1,015,222,628 distinct states found (2,229,465 ds/min), 178,645,315 states left on queue. +Progress(41) at 2024-11-06 22:11:59: 11,756,910,696 states generated (29,037,160 s/min), 1,017,493,811 distinct states found (2,271,183 ds/min), 178,885,854 states left on queue. +Progress(41) at 2024-11-06 22:12:59: 11,785,841,957 states generated (28,931,261 s/min), 1,019,798,730 distinct states found (2,304,919 ds/min), 179,138,831 states left on queue. +Progress(41) at 2024-11-06 22:13:59: 11,814,627,351 states generated (28,785,394 s/min), 1,022,115,935 distinct states found (2,317,205 ds/min), 179,401,355 states left on queue. +Progress(41) at 2024-11-06 22:14:59: 11,843,482,288 states generated (28,854,937 s/min), 1,024,372,991 distinct states found (2,257,056 ds/min), 179,570,167 states left on queue. +Progress(41) at 2024-11-06 22:15:59: 11,872,232,503 states generated (28,750,215 s/min), 1,026,655,919 distinct states found (2,282,928 ds/min), 179,704,400 states left on queue. +Progress(41) at 2024-11-06 22:16:59: 11,901,011,327 states generated (28,778,824 s/min), 1,028,780,151 distinct states found (2,124,232 ds/min), 179,744,822 states left on queue. +Progress(41) at 2024-11-06 22:17:59: 11,930,078,061 states generated (29,066,734 s/min), 1,030,863,673 distinct states found (2,083,522 ds/min), 179,790,662 states left on queue. +Progress(41) at 2024-11-06 22:18:59: 11,959,463,901 states generated (29,385,840 s/min), 1,032,840,344 distinct states found (1,976,671 ds/min), 179,738,442 states left on queue. +Progress(41) at 2024-11-06 22:19:59: 11,988,811,132 states generated (29,347,231 s/min), 1,034,897,049 distinct states found (2,056,705 ds/min), 179,788,782 states left on queue. +Progress(41) at 2024-11-06 22:20:59: 12,018,335,911 states generated (29,524,779 s/min), 1,037,158,579 distinct states found (2,261,530 ds/min), 179,978,226 states left on queue. +Progress(41) at 2024-11-06 22:21:59: 12,047,755,593 states generated (29,419,682 s/min), 1,039,437,623 distinct states found (2,279,044 ds/min), 180,177,371 states left on queue. +Progress(41) at 2024-11-06 22:22:59: 12,077,111,001 states generated (29,355,408 s/min), 1,041,672,961 distinct states found (2,235,338 ds/min), 180,336,777 states left on queue. +Progress(41) at 2024-11-06 22:23:59: 12,106,556,177 states generated (29,445,176 s/min), 1,043,675,880 distinct states found (2,002,919 ds/min), 180,345,759 states left on queue. 
+Progress(41) at 2024-11-06 22:24:59: 12,135,797,446 states generated (29,241,269 s/min), 1,045,966,606 distinct states found (2,290,726 ds/min), 180,552,887 states left on queue. +Progress(41) at 2024-11-06 22:25:59: 12,165,143,756 states generated (29,346,310 s/min), 1,048,373,643 distinct states found (2,407,037 ds/min), 180,860,142 states left on queue. +Progress(41) at 2024-11-06 22:26:59: 12,194,478,236 states generated (29,334,480 s/min), 1,050,403,560 distinct states found (2,029,917 ds/min), 180,873,811 states left on queue. +Progress(41) at 2024-11-06 22:27:59: 12,223,653,080 states generated (29,174,844 s/min), 1,052,798,502 distinct states found (2,394,942 ds/min), 181,184,025 states left on queue. +Progress(41) at 2024-11-06 22:28:59: 12,252,926,784 states generated (29,273,704 s/min), 1,055,243,990 distinct states found (2,445,488 ds/min), 181,542,525 states left on queue. +Progress(41) at 2024-11-06 22:29:59: 12,282,176,071 states generated (29,249,287 s/min), 1,057,488,489 distinct states found (2,244,499 ds/min), 181,704,266 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 22:31:00) +Progress(41) at 2024-11-06 22:31:00: 12,311,654,529 states generated (29,478,458 s/min), 1,059,789,296 distinct states found (2,300,807 ds/min), 181,875,392 states left on queue. +Progress(41) at 2024-11-06 22:32:00: 12,340,837,903 states generated (29,183,374 s/min), 1,061,857,294 distinct states found (2,067,998 ds/min), 181,860,690 states left on queue. +Progress(41) at 2024-11-06 22:33:00: 12,369,978,352 states generated (29,140,449 s/min), 1,063,943,173 distinct states found (2,085,879 ds/min), 181,951,091 states left on queue. +Progress(41) at 2024-11-06 22:34:00: 12,398,820,660 states generated (28,842,308 s/min), 1,066,384,327 distinct states found (2,441,154 ds/min), 182,284,376 states left on queue. +Progress(41) at 2024-11-06 22:35:00: 12,427,966,245 states generated (29,145,585 s/min), 1,068,376,116 distinct states found (1,991,789 ds/min), 182,275,982 states left on queue. +Progress(41) at 2024-11-06 22:36:00: 12,457,300,671 states generated (29,334,426 s/min), 1,070,596,949 distinct states found (2,220,833 ds/min), 182,442,278 states left on queue. +Progress(41) at 2024-11-06 22:37:00: 12,486,769,483 states generated (29,468,812 s/min), 1,072,968,640 distinct states found (2,371,691 ds/min), 182,718,485 states left on queue. +Progress(41) at 2024-11-06 22:38:00: 12,516,031,360 states generated (29,261,877 s/min), 1,075,001,378 distinct states found (2,032,738 ds/min), 182,729,966 states left on queue. +Progress(41) at 2024-11-06 22:39:00: 12,545,265,331 states generated (29,233,971 s/min), 1,076,880,794 distinct states found (1,879,416 ds/min), 182,634,798 states left on queue. +Progress(41) at 2024-11-06 22:40:00: 12,574,495,559 states generated (29,230,228 s/min), 1,079,123,856 distinct states found (2,243,062 ds/min), 182,812,322 states left on queue. +Progress(41) at 2024-11-06 22:41:00: 12,603,757,387 states generated (29,261,828 s/min), 1,081,610,769 distinct states found (2,486,913 ds/min), 183,219,247 states left on queue. +Progress(41) at 2024-11-06 22:42:00: 12,632,909,026 states generated (29,151,639 s/min), 1,083,967,637 distinct states found (2,356,868 ds/min), 183,478,879 states left on queue. +Progress(41) at 2024-11-06 22:43:00: 12,662,254,981 states generated (29,345,955 s/min), 1,086,272,935 distinct states found (2,305,298 ds/min), 183,726,701 states left on queue. 
+Progress(41) at 2024-11-06 22:44:00: 12,691,400,218 states generated (29,145,237 s/min), 1,088,778,928 distinct states found (2,505,993 ds/min), 184,128,274 states left on queue. +Progress(41) at 2024-11-06 22:45:00: 12,720,528,098 states generated (29,127,880 s/min), 1,091,335,929 distinct states found (2,557,001 ds/min), 184,556,078 states left on queue. +Progress(41) at 2024-11-06 22:46:00: 12,749,701,886 states generated (29,173,788 s/min), 1,093,889,510 distinct states found (2,553,581 ds/min), 184,916,391 states left on queue. +Progress(41) at 2024-11-06 22:47:00: 12,779,153,937 states generated (29,452,051 s/min), 1,096,185,973 distinct states found (2,296,463 ds/min), 185,115,877 states left on queue. +Progress(41) at 2024-11-06 22:48:00: 12,808,440,971 states generated (29,287,034 s/min), 1,098,733,865 distinct states found (2,547,892 ds/min), 185,564,617 states left on queue. +Progress(41) at 2024-11-06 22:49:00: 12,837,695,256 states generated (29,254,285 s/min), 1,100,705,460 distinct states found (1,971,595 ds/min), 185,532,558 states left on queue. +Progress(41) at 2024-11-06 22:50:00: 12,866,801,129 states generated (29,105,873 s/min), 1,103,074,603 distinct states found (2,369,143 ds/min), 185,770,427 states left on queue. +Progress(41) at 2024-11-06 22:51:00: 12,895,682,870 states generated (28,881,741 s/min), 1,105,437,747 distinct states found (2,363,144 ds/min), 186,049,274 states left on queue. +Progress(41) at 2024-11-06 22:52:00: 12,924,655,990 states generated (28,973,120 s/min), 1,107,853,554 distinct states found (2,415,807 ds/min), 186,325,129 states left on queue. +Progress(41) at 2024-11-06 22:53:00: 12,953,616,826 states generated (28,960,836 s/min), 1,110,097,321 distinct states found (2,243,767 ds/min), 186,509,276 states left on queue. +Progress(41) at 2024-11-06 22:54:00: 12,982,711,068 states generated (29,094,242 s/min), 1,112,146,097 distinct states found (2,048,776 ds/min), 186,507,356 states left on queue. +Progress(41) at 2024-11-06 22:55:00: 13,011,962,667 states generated (29,251,599 s/min), 1,114,530,785 distinct states found (2,384,688 ds/min), 186,758,016 states left on queue. +Progress(41) at 2024-11-06 22:56:00: 13,041,163,382 states generated (29,200,715 s/min), 1,116,566,038 distinct states found (2,035,253 ds/min), 186,702,453 states left on queue. +Progress(41) at 2024-11-06 22:57:00: 13,070,416,604 states generated (29,253,222 s/min), 1,118,433,735 distinct states found (1,867,697 ds/min), 186,595,926 states left on queue. +Progress(41) at 2024-11-06 22:58:00: 13,099,393,765 states generated (28,977,161 s/min), 1,120,727,626 distinct states found (2,293,891 ds/min), 186,785,521 states left on queue. +Progress(41) at 2024-11-06 22:59:00: 13,128,309,003 states generated (28,915,238 s/min), 1,123,075,278 distinct states found (2,347,652 ds/min), 186,977,496 states left on queue. +Progress(42) at 2024-11-06 23:00:00: 13,157,492,254 states generated (29,183,251 s/min), 1,125,164,050 distinct states found (2,088,772 ds/min), 186,994,591 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 23:01:01) +Progress(42) at 2024-11-06 23:01:01: 13,187,099,442 states generated (29,607,188 s/min), 1,126,955,828 distinct states found (1,791,778 ds/min), 186,860,457 states left on queue. +Progress(42) at 2024-11-06 23:02:01: 13,216,408,249 states generated (29,308,807 s/min), 1,128,852,586 distinct states found (1,896,758 ds/min), 186,800,218 states left on queue. 
+Progress(42) at 2024-11-06 23:03:01: 13,245,736,139 states generated (29,327,890 s/min), 1,130,960,381 distinct states found (2,107,795 ds/min), 186,911,118 states left on queue. +Progress(42) at 2024-11-06 23:04:01: 13,274,893,464 states generated (29,157,325 s/min), 1,132,863,930 distinct states found (1,903,549 ds/min), 186,892,129 states left on queue. +Progress(42) at 2024-11-06 23:05:01: 13,304,183,990 states generated (29,290,526 s/min), 1,134,876,306 distinct states found (2,012,376 ds/min), 186,965,153 states left on queue. +Progress(42) at 2024-11-06 23:06:01: 13,333,457,770 states generated (29,273,780 s/min), 1,136,812,237 distinct states found (1,935,931 ds/min), 186,957,506 states left on queue. +Progress(42) at 2024-11-06 23:07:01: 13,362,984,994 states generated (29,527,224 s/min), 1,138,649,876 distinct states found (1,837,639 ds/min), 186,823,887 states left on queue. +Progress(42) at 2024-11-06 23:08:01: 13,392,550,733 states generated (29,565,739 s/min), 1,140,795,722 distinct states found (2,145,846 ds/min), 186,974,795 states left on queue. +Progress(42) at 2024-11-06 23:09:01: 13,422,111,300 states generated (29,560,567 s/min), 1,143,038,611 distinct states found (2,242,889 ds/min), 187,179,197 states left on queue. +Progress(42) at 2024-11-06 23:10:01: 13,451,822,496 states generated (29,711,196 s/min), 1,145,071,502 distinct states found (2,032,891 ds/min), 187,190,480 states left on queue. +Progress(42) at 2024-11-06 23:11:01: 13,481,293,484 states generated (29,470,988 s/min), 1,146,905,806 distinct states found (1,834,304 ds/min), 187,079,661 states left on queue. +Progress(42) at 2024-11-06 23:12:01: 13,510,659,679 states generated (29,366,195 s/min), 1,148,841,643 distinct states found (1,935,837 ds/min), 187,082,815 states left on queue. +Progress(42) at 2024-11-06 23:13:01: 13,539,730,883 states generated (29,071,204 s/min), 1,150,715,436 distinct states found (1,873,793 ds/min), 187,013,975 states left on queue. +Progress(42) at 2024-11-06 23:14:01: 13,568,973,308 states generated (29,242,425 s/min), 1,152,689,735 distinct states found (1,974,299 ds/min), 187,016,208 states left on queue. +Progress(42) at 2024-11-06 23:15:01: 13,598,106,627 states generated (29,133,319 s/min), 1,154,829,869 distinct states found (2,140,134 ds/min), 187,147,884 states left on queue. +Progress(42) at 2024-11-06 23:16:01: 13,627,319,459 states generated (29,212,832 s/min), 1,156,740,070 distinct states found (1,910,201 ds/min), 187,086,942 states left on queue. +Progress(42) at 2024-11-06 23:17:01: 13,656,462,121 states generated (29,142,662 s/min), 1,158,698,307 distinct states found (1,958,237 ds/min), 187,072,201 states left on queue. +Progress(42) at 2024-11-06 23:18:01: 13,685,545,941 states generated (29,083,820 s/min), 1,160,688,939 distinct states found (1,990,632 ds/min), 187,078,553 states left on queue. +Progress(42) at 2024-11-06 23:19:01: 13,714,652,628 states generated (29,106,687 s/min), 1,162,748,633 distinct states found (2,059,694 ds/min), 187,157,229 states left on queue. +Progress(42) at 2024-11-06 23:20:01: 13,744,105,986 states generated (29,453,358 s/min), 1,164,748,782 distinct states found (2,000,149 ds/min), 187,275,480 states left on queue. +Progress(42) at 2024-11-06 23:21:01: 13,773,414,393 states generated (29,308,407 s/min), 1,166,804,740 distinct states found (2,055,958 ds/min), 187,393,312 states left on queue. 
+Progress(42) at 2024-11-06 23:22:01: 13,802,600,069 states generated (29,185,676 s/min), 1,169,251,493 distinct states found (2,446,753 ds/min), 187,781,298 states left on queue. +Progress(42) at 2024-11-06 23:23:01: 13,831,830,649 states generated (29,230,580 s/min), 1,171,412,176 distinct states found (2,160,683 ds/min), 187,932,991 states left on queue. +Progress(42) at 2024-11-06 23:24:01: 13,861,152,221 states generated (29,321,572 s/min), 1,173,582,994 distinct states found (2,170,818 ds/min), 188,078,037 states left on queue. +Progress(42) at 2024-11-06 23:25:01: 13,890,538,756 states generated (29,386,535 s/min), 1,175,642,901 distinct states found (2,059,907 ds/min), 188,116,794 states left on queue. +Progress(42) at 2024-11-06 23:26:01: 13,919,812,820 states generated (29,274,064 s/min), 1,177,743,048 distinct states found (2,100,147 ds/min), 188,189,399 states left on queue. +Progress(42) at 2024-11-06 23:27:01: 13,948,903,585 states generated (29,090,765 s/min), 1,179,980,470 distinct states found (2,237,422 ds/min), 188,388,309 states left on queue. +Progress(42) at 2024-11-06 23:28:01: 13,978,138,385 states generated (29,234,800 s/min), 1,182,134,981 distinct states found (2,154,511 ds/min), 188,526,735 states left on queue. +Progress(42) at 2024-11-06 23:29:01: 14,007,310,151 states generated (29,171,766 s/min), 1,184,360,360 distinct states found (2,225,379 ds/min), 188,718,575 states left on queue. +Progress(42) at 2024-11-06 23:30:01: 14,036,411,110 states generated (29,100,959 s/min), 1,186,617,835 distinct states found (2,257,475 ds/min), 188,941,068 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-06 23:31:01) +Progress(42) at 2024-11-06 23:31:01: 14,065,894,113 states generated (29,483,003 s/min), 1,188,743,048 distinct states found (2,125,213 ds/min), 189,035,636 states left on queue. +Progress(42) at 2024-11-06 23:32:01: 14,094,909,096 states generated (29,014,983 s/min), 1,191,096,961 distinct states found (2,353,913 ds/min), 189,332,174 states left on queue. +Progress(42) at 2024-11-06 23:33:01: 14,124,212,567 states generated (29,303,471 s/min), 1,193,012,997 distinct states found (1,916,036 ds/min), 189,266,016 states left on queue. +Progress(42) at 2024-11-06 23:34:01: 14,153,428,768 states generated (29,216,201 s/min), 1,195,170,448 distinct states found (2,157,451 ds/min), 189,430,881 states left on queue. +Progress(42) at 2024-11-06 23:35:01: 14,182,568,290 states generated (29,139,522 s/min), 1,197,127,126 distinct states found (1,956,678 ds/min), 189,423,769 states left on queue. +Progress(42) at 2024-11-06 23:36:01: 14,211,602,024 states generated (29,033,734 s/min), 1,199,044,612 distinct states found (1,917,486 ds/min), 189,380,199 states left on queue. +Progress(42) at 2024-11-06 23:37:01: 14,240,593,845 states generated (28,991,821 s/min), 1,200,900,028 distinct states found (1,855,416 ds/min), 189,324,925 states left on queue. +Progress(42) at 2024-11-06 23:38:01: 14,269,687,808 states generated (29,093,963 s/min), 1,203,034,598 distinct states found (2,134,570 ds/min), 189,466,947 states left on queue. +Progress(42) at 2024-11-06 23:39:01: 14,298,626,140 states generated (28,938,332 s/min), 1,205,190,806 distinct states found (2,156,208 ds/min), 189,608,794 states left on queue. +Progress(42) at 2024-11-06 23:40:01: 14,327,587,116 states generated (28,960,976 s/min), 1,207,339,559 distinct states found (2,148,753 ds/min), 189,750,359 states left on queue. 
+Progress(42) at 2024-11-06 23:41:01: 14,356,469,494 states generated (28,882,378 s/min), 1,209,518,146 distinct states found (2,178,587 ds/min), 189,892,036 states left on queue. +Progress(42) at 2024-11-06 23:42:01: 14,385,314,696 states generated (28,845,202 s/min), 1,211,701,473 distinct states found (2,183,327 ds/min), 190,050,090 states left on queue. +Progress(42) at 2024-11-06 23:43:01: 14,414,142,550 states generated (28,827,854 s/min), 1,213,859,919 distinct states found (2,158,446 ds/min), 190,161,804 states left on queue. +Progress(42) at 2024-11-06 23:44:01: 14,442,945,644 states generated (28,803,094 s/min), 1,216,005,127 distinct states found (2,145,208 ds/min), 190,173,898 states left on queue. +Progress(42) at 2024-11-06 23:45:01: 14,471,693,798 states generated (28,748,154 s/min), 1,218,030,292 distinct states found (2,025,165 ds/min), 190,127,864 states left on queue. +Progress(42) at 2024-11-06 23:46:01: 14,500,599,025 states generated (28,905,227 s/min), 1,219,996,243 distinct states found (1,965,951 ds/min), 190,069,034 states left on queue. +Progress(42) at 2024-11-06 23:47:01: 14,529,770,118 states generated (29,171,093 s/min), 1,221,890,284 distinct states found (1,894,041 ds/min), 189,948,701 states left on queue. +Progress(42) at 2024-11-06 23:48:01: 14,559,044,399 states generated (29,274,281 s/min), 1,223,772,100 distinct states found (1,881,816 ds/min), 189,844,417 states left on queue. +Progress(42) at 2024-11-06 23:49:01: 14,588,505,088 states generated (29,460,689 s/min), 1,225,870,790 distinct states found (2,098,690 ds/min), 189,921,025 states left on queue. +Progress(42) at 2024-11-06 23:50:01: 14,618,007,797 states generated (29,502,709 s/min), 1,227,944,381 distinct states found (2,073,591 ds/min), 189,947,590 states left on queue. +Progress(42) at 2024-11-06 23:51:01: 14,647,405,532 states generated (29,397,735 s/min), 1,230,287,712 distinct states found (2,343,331 ds/min), 190,200,223 states left on queue. +Progress(42) at 2024-11-06 23:52:01: 14,676,733,478 states generated (29,327,946 s/min), 1,232,303,440 distinct states found (2,015,728 ds/min), 190,178,290 states left on queue. +Progress(42) at 2024-11-06 23:53:01: 14,706,089,483 states generated (29,356,005 s/min), 1,234,269,055 distinct states found (1,965,615 ds/min), 190,175,215 states left on queue. +Progress(42) at 2024-11-06 23:54:01: 14,735,226,809 states generated (29,137,326 s/min), 1,236,451,189 distinct states found (2,182,134 ds/min), 190,293,853 states left on queue. +Progress(42) at 2024-11-06 23:55:01: 14,764,611,146 states generated (29,384,337 s/min), 1,238,780,557 distinct states found (2,329,368 ds/min), 190,528,991 states left on queue. +Progress(42) at 2024-11-06 23:56:01: 14,793,911,038 states generated (29,299,892 s/min), 1,240,745,156 distinct states found (1,964,599 ds/min), 190,493,881 states left on queue. +Progress(42) at 2024-11-06 23:57:01: 14,823,113,635 states generated (29,202,597 s/min), 1,242,984,781 distinct states found (2,239,625 ds/min), 190,675,723 states left on queue. +Progress(42) at 2024-11-06 23:58:01: 14,852,208,056 states generated (29,094,421 s/min), 1,245,341,804 distinct states found (2,357,023 ds/min), 190,959,027 states left on queue. +Progress(42) at 2024-11-06 23:59:01: 14,881,390,523 states generated (29,182,467 s/min), 1,247,530,823 distinct states found (2,189,019 ds/min), 191,085,175 states left on queue. 
+Progress(42) at 2024-11-07 00:00:01: 14,910,709,837 states generated (29,319,314 s/min), 1,249,665,632 distinct states found (2,134,809 ds/min), 191,148,911 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 00:01:02) +Progress(42) at 2024-11-07 00:01:02: 14,940,301,722 states generated (29,591,885 s/min), 1,251,820,098 distinct states found (2,154,466 ds/min), 191,164,099 states left on queue. +Progress(42) at 2024-11-07 00:02:02: 14,969,468,946 states generated (29,167,224 s/min), 1,253,608,374 distinct states found (1,788,276 ds/min), 190,977,899 states left on queue. +Progress(42) at 2024-11-07 00:03:02: 14,998,469,861 states generated (29,000,915 s/min), 1,255,846,206 distinct states found (2,237,832 ds/min), 191,179,932 states left on queue. +Progress(42) at 2024-11-07 00:04:02: 15,027,424,344 states generated (28,954,483 s/min), 1,258,012,253 distinct states found (2,166,047 ds/min), 191,269,006 states left on queue. +Progress(42) at 2024-11-07 00:05:02: 15,056,595,053 states generated (29,170,709 s/min), 1,259,974,817 distinct states found (1,962,564 ds/min), 191,232,379 states left on queue. +Progress(42) at 2024-11-07 00:06:02: 15,085,857,792 states generated (29,262,739 s/min), 1,262,139,752 distinct states found (2,164,935 ds/min), 191,351,326 states left on queue. +Progress(42) at 2024-11-07 00:07:02: 15,115,386,019 states generated (29,528,227 s/min), 1,264,425,723 distinct states found (2,285,971 ds/min), 191,549,077 states left on queue. +Progress(42) at 2024-11-07 00:08:02: 15,144,705,784 states generated (29,319,765 s/min), 1,266,390,816 distinct states found (1,965,093 ds/min), 191,495,454 states left on queue. +Progress(42) at 2024-11-07 00:09:02: 15,173,877,454 states generated (29,171,670 s/min), 1,268,144,487 distinct states found (1,753,671 ds/min), 191,300,959 states left on queue. +Progress(42) at 2024-11-07 00:10:02: 15,203,080,845 states generated (29,203,391 s/min), 1,270,256,870 distinct states found (2,112,383 ds/min), 191,363,085 states left on queue. +Progress(42) at 2024-11-07 00:11:02: 15,232,426,418 states generated (29,345,573 s/min), 1,272,624,413 distinct states found (2,367,543 ds/min), 191,673,032 states left on queue. +Progress(42) at 2024-11-07 00:12:02: 15,261,677,209 states generated (29,250,791 s/min), 1,274,995,857 distinct states found (2,371,444 ds/min), 191,960,618 states left on queue. +Progress(42) at 2024-11-07 00:13:02: 15,290,882,314 states generated (29,205,105 s/min), 1,277,269,501 distinct states found (2,273,644 ds/min), 192,155,220 states left on queue. +Progress(42) at 2024-11-07 00:14:02: 15,320,166,816 states generated (29,284,502 s/min), 1,279,524,897 distinct states found (2,255,396 ds/min), 192,367,797 states left on queue. +Progress(42) at 2024-11-07 00:15:02: 15,349,391,017 states generated (29,224,201 s/min), 1,281,912,896 distinct states found (2,387,999 ds/min), 192,657,361 states left on queue. +Progress(42) at 2024-11-07 00:16:02: 15,378,510,873 states generated (29,119,856 s/min), 1,284,352,819 distinct states found (2,439,923 ds/min), 192,982,001 states left on queue. +Progress(42) at 2024-11-07 00:17:02: 15,407,729,690 states generated (29,218,817 s/min), 1,286,798,116 distinct states found (2,445,297 ds/min), 193,251,888 states left on queue. +Progress(42) at 2024-11-07 00:18:02: 15,437,122,682 states generated (29,392,992 s/min), 1,289,060,398 distinct states found (2,262,282 ds/min), 193,393,686 states left on queue. 
+Progress(42) at 2024-11-07 00:19:02: 15,466,437,919 states generated (29,315,237 s/min), 1,291,390,007 distinct states found (2,329,609 ds/min), 193,674,611 states left on queue. +Progress(42) at 2024-11-07 00:20:02: 15,495,795,434 states generated (29,357,515 s/min), 1,293,625,999 distinct states found (2,235,992 ds/min), 193,855,148 states left on queue. +Progress(42) at 2024-11-07 00:21:02: 15,524,856,146 states generated (29,060,712 s/min), 1,295,675,220 distinct states found (2,049,221 ds/min), 193,858,347 states left on queue. +Progress(42) at 2024-11-07 00:22:02: 15,553,951,279 states generated (29,095,133 s/min), 1,297,806,219 distinct states found (2,130,999 ds/min), 193,910,330 states left on queue. +Progress(42) at 2024-11-07 00:23:02: 15,582,781,229 states generated (28,829,950 s/min), 1,300,215,254 distinct states found (2,409,035 ds/min), 194,211,020 states left on queue. +Progress(42) at 2024-11-07 00:24:02: 15,611,889,872 states generated (29,108,643 s/min), 1,302,431,347 distinct states found (2,216,093 ds/min), 194,324,070 states left on queue. +Progress(42) at 2024-11-07 00:25:02: 15,640,778,210 states generated (28,888,338 s/min), 1,304,674,839 distinct states found (2,243,492 ds/min), 194,483,563 states left on queue. +Progress(42) at 2024-11-07 00:26:02: 15,669,830,004 states generated (29,051,794 s/min), 1,306,661,103 distinct states found (1,986,264 ds/min), 194,429,101 states left on queue. +Progress(42) at 2024-11-07 00:27:02: 15,699,049,213 states generated (29,219,209 s/min), 1,308,920,712 distinct states found (2,259,609 ds/min), 194,577,576 states left on queue. +Progress(42) at 2024-11-07 00:28:02: 15,728,283,982 states generated (29,234,769 s/min), 1,310,924,780 distinct states found (2,004,068 ds/min), 194,488,601 states left on queue. +Progress(42) at 2024-11-07 00:29:02: 15,757,507,793 states generated (29,223,811 s/min), 1,312,729,390 distinct states found (1,804,610 ds/min), 194,321,454 states left on queue. +Progress(42) at 2024-11-07 00:30:02: 15,786,513,733 states generated (29,005,940 s/min), 1,314,926,573 distinct states found (2,197,183 ds/min), 194,422,995 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 00:31:03) +Progress(42) at 2024-11-07 00:31:03: 15,815,683,048 states generated (29,169,315 s/min), 1,317,135,461 distinct states found (2,208,888 ds/min), 194,492,192 states left on queue. +Progress(42) at 2024-11-07 00:32:03: 15,844,758,678 states generated (29,075,630 s/min), 1,319,144,875 distinct states found (2,009,414 ds/min), 194,413,387 states left on queue. +Progress(42) at 2024-11-07 00:33:03: 15,873,998,157 states generated (29,239,479 s/min), 1,320,932,025 distinct states found (1,787,150 ds/min), 194,281,981 states left on queue. +Progress(42) at 2024-11-07 00:34:03: 15,903,205,479 states generated (29,207,322 s/min), 1,322,654,400 distinct states found (1,722,375 ds/min), 194,091,121 states left on queue. +Progress(42) at 2024-11-07 00:35:03: 15,932,501,264 states generated (29,295,785 s/min), 1,324,682,430 distinct states found (2,028,030 ds/min), 194,137,494 states left on queue. +Progress(42) at 2024-11-07 00:36:03: 15,961,589,919 states generated (29,088,655 s/min), 1,326,509,334 distinct states found (1,826,904 ds/min), 194,051,639 states left on queue. +Progress(42) at 2024-11-07 00:37:03: 15,990,668,327 states generated (29,078,408 s/min), 1,328,357,672 distinct states found (1,848,338 ds/min), 193,989,585 states left on queue. 
+Progress(42) at 2024-11-07 00:38:03: 16,019,782,313 states generated (29,113,986 s/min), 1,330,232,446 distinct states found (1,874,774 ds/min), 193,949,446 states left on queue. +Progress(42) at 2024-11-07 00:39:03: 16,049,252,200 states generated (29,469,887 s/min), 1,331,987,412 distinct states found (1,754,966 ds/min), 193,747,896 states left on queue. +Progress(42) at 2024-11-07 00:40:03: 16,078,692,514 states generated (29,440,314 s/min), 1,333,894,185 distinct states found (1,906,773 ds/min), 193,729,942 states left on queue. +Progress(42) at 2024-11-07 00:41:03: 16,108,160,136 states generated (29,467,622 s/min), 1,336,102,661 distinct states found (2,208,476 ds/min), 193,914,624 states left on queue. +Progress(42) at 2024-11-07 00:42:03: 16,137,813,382 states generated (29,653,246 s/min), 1,338,180,836 distinct states found (2,078,175 ds/min), 193,976,996 states left on queue. +Progress(43) at 2024-11-07 00:43:03: 16,167,357,885 states generated (29,544,503 s/min), 1,339,957,139 distinct states found (1,776,303 ds/min), 193,787,392 states left on queue. +Progress(43) at 2024-11-07 00:44:03: 16,196,650,450 states generated (29,292,565 s/min), 1,341,719,088 distinct states found (1,761,949 ds/min), 193,648,551 states left on queue. +Progress(43) at 2024-11-07 00:45:03: 16,225,735,286 states generated (29,084,836 s/min), 1,343,468,127 distinct states found (1,749,039 ds/min), 193,497,590 states left on queue. +Progress(43) at 2024-11-07 00:46:03: 16,254,805,612 states generated (29,070,326 s/min), 1,345,280,226 distinct states found (1,812,099 ds/min), 193,364,788 states left on queue. +Progress(43) at 2024-11-07 00:47:03: 16,283,933,423 states generated (29,127,811 s/min), 1,347,294,879 distinct states found (2,014,653 ds/min), 193,397,713 states left on queue. +Progress(43) at 2024-11-07 00:48:03: 16,312,911,730 states generated (28,978,307 s/min), 1,349,192,377 distinct states found (1,897,498 ds/min), 193,321,503 states left on queue. +Progress(43) at 2024-11-07 00:49:03: 16,342,115,657 states generated (29,203,927 s/min), 1,350,961,684 distinct states found (1,769,307 ds/min), 193,144,596 states left on queue. +Progress(43) at 2024-11-07 00:50:03: 16,370,988,391 states generated (28,872,734 s/min), 1,352,868,904 distinct states found (1,907,220 ds/min), 193,089,969 states left on queue. +Progress(43) at 2024-11-07 00:51:03: 16,400,089,208 states generated (29,100,817 s/min), 1,354,864,448 distinct states found (1,995,544 ds/min), 193,098,377 states left on queue. +Progress(43) at 2024-11-07 00:52:03: 16,429,331,456 states generated (29,242,248 s/min), 1,356,734,632 distinct states found (1,870,184 ds/min), 193,093,615 states left on queue. +Progress(43) at 2024-11-07 00:53:03: 16,458,648,761 states generated (29,317,305 s/min), 1,358,622,917 distinct states found (1,888,285 ds/min), 193,098,172 states left on queue. +Progress(43) at 2024-11-07 00:54:03: 16,487,874,773 states generated (29,226,012 s/min), 1,360,737,908 distinct states found (2,114,991 ds/min), 193,250,949 states left on queue. +Progress(43) at 2024-11-07 00:55:03: 16,517,101,401 states generated (29,226,628 s/min), 1,363,024,072 distinct states found (2,286,164 ds/min), 193,508,719 states left on queue. +Progress(43) at 2024-11-07 00:56:03: 16,546,231,362 states generated (29,129,961 s/min), 1,365,056,771 distinct states found (2,032,699 ds/min), 193,558,441 states left on queue. 
+Progress(43) at 2024-11-07 00:57:03: 16,575,532,837 states generated (29,301,475 s/min), 1,367,107,709 distinct states found (2,050,938 ds/min), 193,609,354 states left on queue. +Progress(43) at 2024-11-07 00:58:03: 16,604,872,137 states generated (29,339,300 s/min), 1,369,059,417 distinct states found (1,951,708 ds/min), 193,561,420 states left on queue. +Progress(43) at 2024-11-07 00:59:03: 16,634,070,732 states generated (29,198,595 s/min), 1,371,016,928 distinct states found (1,957,511 ds/min), 193,513,278 states left on queue. +Progress(43) at 2024-11-07 01:00:03: 16,663,158,113 states generated (29,087,381 s/min), 1,373,092,542 distinct states found (2,075,614 ds/min), 193,582,661 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 01:01:03) +Progress(43) at 2024-11-07 01:01:03: 16,692,576,110 states generated (29,417,997 s/min), 1,375,200,108 distinct states found (2,107,566 ds/min), 193,664,621 states left on queue. +Progress(43) at 2024-11-07 01:02:03: 16,721,716,479 states generated (29,140,369 s/min), 1,377,247,529 distinct states found (2,047,421 ds/min), 193,708,538 states left on queue. +Progress(43) at 2024-11-07 01:03:03: 16,750,779,523 states generated (29,063,044 s/min), 1,379,368,087 distinct states found (2,120,558 ds/min), 193,813,065 states left on queue. +Progress(43) at 2024-11-07 01:04:03: 16,779,794,524 states generated (29,015,001 s/min), 1,381,371,287 distinct states found (2,003,200 ds/min), 193,825,465 states left on queue. +Progress(43) at 2024-11-07 01:05:03: 16,808,907,203 states generated (29,112,679 s/min), 1,383,515,008 distinct states found (2,143,721 ds/min), 193,953,821 states left on queue. +Progress(43) at 2024-11-07 01:06:03: 16,838,029,628 states generated (29,122,425 s/min), 1,385,629,882 distinct states found (2,114,874 ds/min), 194,038,163 states left on queue. +Progress(43) at 2024-11-07 01:07:03: 16,867,418,111 states generated (29,388,483 s/min), 1,387,561,049 distinct states found (1,931,167 ds/min), 194,004,058 states left on queue. +Progress(43) at 2024-11-07 01:08:03: 16,896,555,416 states generated (29,137,305 s/min), 1,389,592,238 distinct states found (2,031,189 ds/min), 194,058,208 states left on queue. +Progress(43) at 2024-11-07 01:09:03: 16,925,642,685 states generated (29,087,269 s/min), 1,391,404,896 distinct states found (1,812,658 ds/min), 193,924,951 states left on queue. +Progress(43) at 2024-11-07 01:10:03: 16,954,638,533 states generated (28,995,848 s/min), 1,393,186,525 distinct states found (1,781,629 ds/min), 193,784,358 states left on queue. +Progress(43) at 2024-11-07 01:11:03: 16,983,710,894 states generated (29,072,361 s/min), 1,395,018,264 distinct states found (1,831,739 ds/min), 193,697,690 states left on queue. +Progress(43) at 2024-11-07 01:12:03: 17,012,741,316 states generated (29,030,422 s/min), 1,397,039,325 distinct states found (2,021,061 ds/min), 193,755,919 states left on queue. +Progress(43) at 2024-11-07 01:13:03: 17,041,674,538 states generated (28,933,222 s/min), 1,399,086,352 distinct states found (2,047,027 ds/min), 193,799,420 states left on queue. +Progress(43) at 2024-11-07 01:14:03: 17,070,653,912 states generated (28,979,374 s/min), 1,401,092,312 distinct states found (2,005,960 ds/min), 193,820,018 states left on queue. +Progress(43) at 2024-11-07 01:15:03: 17,099,536,446 states generated (28,882,534 s/min), 1,403,159,743 distinct states found (2,067,431 ds/min), 193,867,947 states left on queue. 
+Progress(43) at 2024-11-07 01:16:03: 17,128,396,670 states generated (28,860,224 s/min), 1,405,244,280 distinct states found (2,084,537 ds/min), 193,945,380 states left on queue. +Progress(43) at 2024-11-07 01:17:03: 17,157,276,177 states generated (28,879,507 s/min), 1,407,274,748 distinct states found (2,030,468 ds/min), 193,944,077 states left on queue. +Progress(43) at 2024-11-07 01:18:03: 17,186,149,639 states generated (28,873,462 s/min), 1,409,283,088 distinct states found (2,008,340 ds/min), 193,881,792 states left on queue. +Progress(43) at 2024-11-07 01:19:03: 17,214,923,206 states generated (28,773,567 s/min), 1,411,167,065 distinct states found (1,883,977 ds/min), 193,711,394 states left on queue. +Progress(43) at 2024-11-07 01:20:03: 17,243,730,245 states generated (28,807,039 s/min), 1,413,023,763 distinct states found (1,856,698 ds/min), 193,546,054 states left on queue. +Progress(43) at 2024-11-07 01:21:03: 17,272,650,525 states generated (28,920,280 s/min), 1,414,802,171 distinct states found (1,778,408 ds/min), 193,345,308 states left on queue. +Progress(43) at 2024-11-07 01:22:03: 17,301,943,589 states generated (29,293,064 s/min), 1,416,599,440 distinct states found (1,797,269 ds/min), 193,158,676 states left on queue. +Progress(43) at 2024-11-07 01:23:03: 17,331,337,313 states generated (29,393,724 s/min), 1,418,547,450 distinct states found (1,948,010 ds/min), 193,112,883 states left on queue. +Progress(43) at 2024-11-07 01:24:03: 17,360,793,100 states generated (29,455,787 s/min), 1,420,576,018 distinct states found (2,028,568 ds/min), 193,100,476 states left on queue. +Progress(43) at 2024-11-07 01:25:03: 17,390,123,392 states generated (29,330,292 s/min), 1,422,693,479 distinct states found (2,117,461 ds/min), 193,171,748 states left on queue. +Progress(43) at 2024-11-07 01:26:03: 17,419,468,515 states generated (29,345,123 s/min), 1,424,783,244 distinct states found (2,089,765 ds/min), 193,228,274 states left on queue. +Progress(43) at 2024-11-07 01:27:03: 17,448,810,016 states generated (29,341,501 s/min), 1,426,560,811 distinct states found (1,777,567 ds/min), 193,036,459 states left on queue. +Progress(43) at 2024-11-07 01:28:03: 17,478,034,472 states generated (29,224,456 s/min), 1,428,663,374 distinct states found (2,102,563 ds/min), 193,125,616 states left on queue. +Progress(43) at 2024-11-07 01:29:03: 17,507,201,835 states generated (29,167,363 s/min), 1,430,735,910 distinct states found (2,072,536 ds/min), 193,146,850 states left on queue. +Progress(43) at 2024-11-07 01:30:03: 17,536,546,498 states generated (29,344,663 s/min), 1,432,877,950 distinct states found (2,142,040 ds/min), 193,230,645 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 01:31:04) +Progress(43) at 2024-11-07 01:31:04: 17,566,061,546 states generated (29,515,048 s/min), 1,434,839,708 distinct states found (1,961,758 ds/min), 193,176,951 states left on queue. +Progress(43) at 2024-11-07 01:32:04: 17,595,015,993 states generated (28,954,447 s/min), 1,436,986,257 distinct states found (2,146,549 ds/min), 193,289,254 states left on queue. +Progress(43) at 2024-11-07 01:33:04: 17,624,137,153 states generated (29,121,160 s/min), 1,439,279,150 distinct states found (2,292,893 ds/min), 193,525,973 states left on queue. +Progress(43) at 2024-11-07 01:34:04: 17,653,328,248 states generated (29,191,095 s/min), 1,441,299,767 distinct states found (2,020,617 ds/min), 193,504,947 states left on queue. 
+Progress(43) at 2024-11-07 01:35:04: 17,682,562,562 states generated (29,234,314 s/min), 1,443,317,413 distinct states found (2,017,646 ds/min), 193,471,905 states left on queue. +Progress(43) at 2024-11-07 01:36:04: 17,711,829,397 states generated (29,266,835 s/min), 1,445,304,310 distinct states found (1,986,897 ds/min), 193,370,899 states left on queue. +Progress(43) at 2024-11-07 01:37:04: 17,740,910,347 states generated (29,080,950 s/min), 1,447,009,563 distinct states found (1,705,253 ds/min), 193,129,235 states left on queue. +Progress(43) at 2024-11-07 01:38:04: 17,769,836,321 states generated (28,925,974 s/min), 1,449,139,496 distinct states found (2,129,933 ds/min), 193,234,511 states left on queue. +Progress(43) at 2024-11-07 01:39:04: 17,798,713,067 states generated (28,876,746 s/min), 1,451,211,612 distinct states found (2,072,116 ds/min), 193,241,362 states left on queue. +Progress(43) at 2024-11-07 01:40:04: 17,827,794,691 states generated (29,081,624 s/min), 1,453,062,046 distinct states found (1,850,434 ds/min), 193,114,753 states left on queue. +Progress(43) at 2024-11-07 01:41:04: 17,856,974,014 states generated (29,179,323 s/min), 1,455,151,187 distinct states found (2,089,141 ds/min), 193,169,579 states left on queue. +Progress(43) at 2024-11-07 01:42:04: 17,886,446,666 states generated (29,472,652 s/min), 1,457,303,171 distinct states found (2,151,984 ds/min), 193,263,708 states left on queue. +Progress(43) at 2024-11-07 01:43:04: 17,915,744,840 states generated (29,298,174 s/min), 1,459,261,460 distinct states found (1,958,289 ds/min), 193,194,468 states left on queue. +Progress(43) at 2024-11-07 01:44:04: 17,944,793,057 states generated (29,048,217 s/min), 1,460,885,305 distinct states found (1,623,845 ds/min), 192,905,330 states left on queue. +Progress(43) at 2024-11-07 01:45:04: 17,973,952,967 states generated (29,159,910 s/min), 1,462,880,642 distinct states found (1,995,337 ds/min), 192,871,348 states left on queue. +Progress(43) at 2024-11-07 01:46:04: 18,003,158,344 states generated (29,205,377 s/min), 1,465,077,846 distinct states found (2,197,204 ds/min), 193,039,702 states left on queue. +Progress(43) at 2024-11-07 01:47:04: 18,032,464,087 states generated (29,305,743 s/min), 1,467,361,120 distinct states found (2,283,274 ds/min), 193,271,051 states left on queue. +Progress(43) at 2024-11-07 01:48:04: 18,061,597,682 states generated (29,133,595 s/min), 1,469,505,688 distinct states found (2,144,568 ds/min), 193,354,360 states left on queue. +Progress(43) at 2024-11-07 01:49:04: 18,090,888,515 states generated (29,290,833 s/min), 1,471,655,035 distinct states found (2,149,347 ds/min), 193,472,080 states left on queue. +Progress(43) at 2024-11-07 01:50:04: 18,119,855,749 states generated (28,967,234 s/min), 1,473,959,147 distinct states found (2,304,112 ds/min), 193,714,821 states left on queue. +Progress(43) at 2024-11-07 01:51:04: 18,149,035,954 states generated (29,180,205 s/min), 1,476,253,894 distinct states found (2,294,747 ds/min), 193,939,051 states left on queue. +Progress(43) at 2024-11-07 01:52:04: 18,178,210,402 states generated (29,174,448 s/min), 1,478,557,699 distinct states found (2,303,805 ds/min), 194,141,809 states left on queue. +Progress(43) at 2024-11-07 01:53:04: 18,207,377,534 states generated (29,167,132 s/min), 1,480,870,404 distinct states found (2,312,705 ds/min), 194,307,877 states left on queue. 
+Progress(43) at 2024-11-07 01:54:04: 18,236,577,989 states generated (29,200,455 s/min), 1,483,070,823 distinct states found (2,200,419 ds/min), 194,387,223 states left on queue. +Progress(43) at 2024-11-07 01:55:04: 18,265,859,163 states generated (29,281,174 s/min), 1,485,222,154 distinct states found (2,151,331 ds/min), 194,522,233 states left on queue. +Progress(43) at 2024-11-07 01:56:04: 18,295,148,797 states generated (29,289,634 s/min), 1,487,521,283 distinct states found (2,299,129 ds/min), 194,755,427 states left on queue. +Progress(43) at 2024-11-07 01:57:04: 18,324,289,175 states generated (29,140,378 s/min), 1,489,367,193 distinct states found (1,845,910 ds/min), 194,604,366 states left on queue. +Progress(43) at 2024-11-07 01:58:04: 18,353,385,770 states generated (29,096,595 s/min), 1,491,503,782 distinct states found (2,136,589 ds/min), 194,651,670 states left on queue. +Progress(43) at 2024-11-07 01:59:04: 18,382,277,307 states generated (28,891,537 s/min), 1,493,659,362 distinct states found (2,155,580 ds/min), 194,761,640 states left on queue. +Progress(43) at 2024-11-07 02:00:04: 18,411,146,853 states generated (28,869,546 s/min), 1,495,935,237 distinct states found (2,275,875 ds/min), 194,908,896 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 02:01:05) +Progress(43) at 2024-11-07 02:01:05: 18,440,532,837 states generated (29,385,984 s/min), 1,497,937,249 distinct states found (2,002,012 ds/min), 194,874,096 states left on queue. +Progress(43) at 2024-11-07 02:02:05: 18,469,385,511 states generated (28,852,674 s/min), 1,500,062,084 distinct states found (2,124,835 ds/min), 194,900,930 states left on queue. +Progress(43) at 2024-11-07 02:03:05: 18,498,509,160 states generated (29,123,649 s/min), 1,502,077,387 distinct states found (2,015,303 ds/min), 194,871,250 states left on queue. +Progress(43) at 2024-11-07 02:04:05: 18,527,694,520 states generated (29,185,360 s/min), 1,504,242,136 distinct states found (2,164,749 ds/min), 194,924,748 states left on queue. +Progress(43) at 2024-11-07 02:05:05: 18,556,901,350 states generated (29,206,830 s/min), 1,506,034,687 distinct states found (1,792,551 ds/min), 194,670,609 states left on queue. +Progress(43) at 2024-11-07 02:06:05: 18,586,004,706 states generated (29,103,356 s/min), 1,507,879,191 distinct states found (1,844,504 ds/min), 194,551,782 states left on queue. +Progress(43) at 2024-11-07 02:07:05: 18,614,881,319 states generated (28,876,613 s/min), 1,510,019,997 distinct states found (2,140,806 ds/min), 194,594,957 states left on queue. +Progress(43) at 2024-11-07 02:08:05: 18,643,854,322 states generated (28,973,003 s/min), 1,512,074,165 distinct states found (2,054,168 ds/min), 194,532,832 states left on queue. +Progress(43) at 2024-11-07 02:09:05: 18,672,998,550 states generated (29,144,228 s/min), 1,513,943,120 distinct states found (1,868,955 ds/min), 194,368,599 states left on queue. +Progress(43) at 2024-11-07 02:10:05: 18,702,201,308 states generated (29,202,758 s/min), 1,515,546,068 distinct states found (1,602,948 ds/min), 194,090,755 states left on queue. +Progress(43) at 2024-11-07 02:11:05: 18,731,481,011 states generated (29,279,703 s/min), 1,517,343,788 distinct states found (1,797,720 ds/min), 193,942,961 states left on queue. +Progress(43) at 2024-11-07 02:12:05: 18,760,609,986 states generated (29,128,975 s/min), 1,519,160,050 distinct states found (1,816,262 ds/min), 193,815,502 states left on queue. 
+Progress(43) at 2024-11-07 02:13:05: 18,789,628,202 states generated (29,018,216 s/min), 1,520,860,123 distinct states found (1,700,073 ds/min), 193,642,399 states left on queue. +Progress(43) at 2024-11-07 02:14:05: 18,818,770,407 states generated (29,142,205 s/min), 1,522,616,126 distinct states found (1,756,003 ds/min), 193,516,180 states left on queue. +Progress(43) at 2024-11-07 02:15:05: 18,847,943,521 states generated (29,173,114 s/min), 1,524,373,878 distinct states found (1,757,752 ds/min), 193,352,389 states left on queue. +Progress(43) at 2024-11-07 02:16:05: 18,877,338,814 states generated (29,395,293 s/min), 1,526,022,199 distinct states found (1,648,321 ds/min), 193,099,089 states left on queue. +Progress(43) at 2024-11-07 02:17:05: 18,906,854,907 states generated (29,516,093 s/min), 1,528,057,287 distinct states found (2,035,088 ds/min), 193,164,007 states left on queue. +Progress(43) at 2024-11-07 02:18:05: 18,936,272,714 states generated (29,417,807 s/min), 1,530,070,868 distinct states found (2,013,581 ds/min), 193,195,191 states left on queue. +Progress(43) at 2024-11-07 02:19:05: 18,965,845,291 states generated (29,572,577 s/min), 1,531,953,514 distinct states found (1,882,646 ds/min), 193,094,610 states left on queue. +Progress(44) at 2024-11-07 02:20:05: 18,995,225,711 states generated (29,380,420 s/min), 1,533,586,486 distinct states found (1,632,972 ds/min), 192,813,292 states left on queue. +Progress(44) at 2024-11-07 02:21:05: 19,024,424,249 states generated (29,198,538 s/min), 1,535,341,846 distinct states found (1,755,360 ds/min), 192,665,431 states left on queue. +Progress(44) at 2024-11-07 02:22:05: 19,053,319,611 states generated (28,895,362 s/min), 1,536,913,652 distinct states found (1,571,806 ds/min), 192,336,687 states left on queue. +Progress(44) at 2024-11-07 02:23:05: 19,082,456,366 states generated (29,136,755 s/min), 1,538,781,638 distinct states found (1,867,986 ds/min), 192,258,068 states left on queue. +Progress(44) at 2024-11-07 02:24:05: 19,111,445,941 states generated (28,989,575 s/min), 1,540,696,734 distinct states found (1,915,096 ds/min), 192,193,602 states left on queue. +Progress(44) at 2024-11-07 02:25:05: 19,140,498,683 states generated (29,052,742 s/min), 1,542,368,994 distinct states found (1,672,260 ds/min), 191,938,239 states left on queue. +Progress(44) at 2024-11-07 02:26:05: 19,169,386,645 states generated (28,887,962 s/min), 1,544,099,236 distinct states found (1,730,242 ds/min), 191,741,059 states left on queue. +Progress(44) at 2024-11-07 02:27:05: 19,198,354,957 states generated (28,968,312 s/min), 1,545,891,836 distinct states found (1,792,600 ds/min), 191,577,211 states left on queue. +Progress(44) at 2024-11-07 02:28:05: 19,227,551,398 states generated (29,196,441 s/min), 1,547,751,807 distinct states found (1,859,971 ds/min), 191,530,291 states left on queue. +Progress(44) at 2024-11-07 02:29:05: 19,256,905,544 states generated (29,354,146 s/min), 1,549,562,753 distinct states found (1,810,946 ds/min), 191,492,536 states left on queue. +Progress(44) at 2024-11-07 02:30:05: 19,286,043,009 states generated (29,137,465 s/min), 1,551,387,062 distinct states found (1,824,309 ds/min), 191,432,131 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 02:31:05) +Progress(44) at 2024-11-07 02:31:05: 19,315,478,636 states generated (29,435,627 s/min), 1,553,684,416 distinct states found (2,297,354 ds/min), 191,685,387 states left on queue. 
+Progress(44) at 2024-11-07 02:32:05: 19,344,574,433 states generated (29,095,797 s/min), 1,555,642,251 distinct states found (1,957,835 ds/min), 191,687,380 states left on queue. +Progress(44) at 2024-11-07 02:33:05: 19,373,560,321 states generated (28,985,888 s/min), 1,557,576,032 distinct states found (1,933,781 ds/min), 191,644,771 states left on queue. +Progress(44) at 2024-11-07 02:34:05: 19,402,882,849 states generated (29,322,528 s/min), 1,559,483,211 distinct states found (1,907,179 ds/min), 191,584,351 states left on queue. +Progress(44) at 2024-11-07 02:35:05: 19,432,084,827 states generated (29,201,978 s/min), 1,561,305,888 distinct states found (1,822,677 ds/min), 191,432,596 states left on queue. +Progress(44) at 2024-11-07 02:36:05: 19,461,112,335 states generated (29,027,508 s/min), 1,563,180,797 distinct states found (1,874,909 ds/min), 191,330,553 states left on queue. +Progress(44) at 2024-11-07 02:37:05: 19,490,043,498 states generated (28,931,163 s/min), 1,565,137,368 distinct states found (1,956,571 ds/min), 191,292,333 states left on queue. +Progress(44) at 2024-11-07 02:38:05: 19,519,153,014 states generated (29,109,516 s/min), 1,567,034,954 distinct states found (1,897,586 ds/min), 191,229,524 states left on queue. +Progress(44) at 2024-11-07 02:39:05: 19,548,204,678 states generated (29,051,664 s/min), 1,568,989,443 distinct states found (1,954,489 ds/min), 191,191,752 states left on queue. +Progress(44) at 2024-11-07 02:40:05: 19,577,227,470 states generated (29,022,792 s/min), 1,570,981,495 distinct states found (1,992,052 ds/min), 191,200,024 states left on queue. +Progress(44) at 2024-11-07 02:41:05: 19,606,172,601 states generated (28,945,131 s/min), 1,572,870,324 distinct states found (1,888,829 ds/min), 191,115,956 states left on queue. +Progress(44) at 2024-11-07 02:42:05: 19,635,167,481 states generated (28,994,880 s/min), 1,574,894,468 distinct states found (2,024,144 ds/min), 191,139,869 states left on queue. +Progress(44) at 2024-11-07 02:43:05: 19,664,339,049 states generated (29,171,568 s/min), 1,576,906,348 distinct states found (2,011,880 ds/min), 191,137,521 states left on queue. +Progress(44) at 2024-11-07 02:44:05: 19,693,639,689 states generated (29,300,640 s/min), 1,578,748,425 distinct states found (1,842,077 ds/min), 191,040,518 states left on queue. +Progress(44) at 2024-11-07 02:45:05: 19,722,704,536 states generated (29,064,847 s/min), 1,580,671,538 distinct states found (1,923,113 ds/min), 191,001,469 states left on queue. +Progress(44) at 2024-11-07 02:46:05: 19,751,627,669 states generated (28,923,133 s/min), 1,582,340,762 distinct states found (1,669,224 ds/min), 190,750,504 states left on queue. +Progress(44) at 2024-11-07 02:47:05: 19,780,532,535 states generated (28,904,866 s/min), 1,583,965,049 distinct states found (1,624,287 ds/min), 190,492,540 states left on queue. +Progress(44) at 2024-11-07 02:48:05: 19,809,548,743 states generated (29,016,208 s/min), 1,585,820,774 distinct states found (1,855,725 ds/min), 190,422,454 states left on queue. +Progress(44) at 2024-11-07 02:49:05: 19,838,541,075 states generated (28,992,332 s/min), 1,587,731,649 distinct states found (1,910,875 ds/min), 190,386,932 states left on queue. +Progress(44) at 2024-11-07 02:50:05: 19,867,458,320 states generated (28,917,245 s/min), 1,589,622,141 distinct states found (1,890,492 ds/min), 190,310,460 states left on queue. 
+Progress(44) at 2024-11-07 02:51:05: 19,896,287,158 states generated (28,828,838 s/min), 1,591,517,151 distinct states found (1,895,010 ds/min), 190,235,561 states left on queue. +Progress(44) at 2024-11-07 02:52:05: 19,925,117,820 states generated (28,830,662 s/min), 1,593,453,289 distinct states found (1,936,138 ds/min), 190,176,789 states left on queue. +Progress(44) at 2024-11-07 02:53:05: 19,953,949,651 states generated (28,831,831 s/min), 1,595,392,832 distinct states found (1,939,543 ds/min), 190,137,713 states left on queue. +Progress(44) at 2024-11-07 02:54:05: 19,982,791,590 states generated (28,841,939 s/min), 1,597,295,182 distinct states found (1,902,350 ds/min), 190,030,864 states left on queue. +Progress(44) at 2024-11-07 02:55:05: 20,011,631,796 states generated (28,840,206 s/min), 1,599,162,388 distinct states found (1,867,206 ds/min), 189,857,155 states left on queue. +Progress(44) at 2024-11-07 02:56:05: 20,040,350,017 states generated (28,718,221 s/min), 1,600,882,747 distinct states found (1,720,359 ds/min), 189,556,504 states left on queue. +Progress(44) at 2024-11-07 02:57:05: 20,069,048,267 states generated (28,698,250 s/min), 1,602,583,945 distinct states found (1,701,198 ds/min), 189,276,085 states left on queue. +Progress(44) at 2024-11-07 02:58:05: 20,098,037,079 states generated (28,988,812 s/min), 1,604,245,937 distinct states found (1,661,992 ds/min), 188,968,070 states left on queue. +Progress(44) at 2024-11-07 02:59:05: 20,127,216,730 states generated (29,179,651 s/min), 1,605,916,753 distinct states found (1,670,816 ds/min), 188,703,437 states left on queue. +Progress(44) at 2024-11-07 03:00:05: 20,156,712,917 states generated (29,496,187 s/min), 1,607,868,866 distinct states found (1,952,113 ds/min), 188,640,553 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 03:01:06) +Progress(44) at 2024-11-07 03:01:06: 20,186,396,044 states generated (29,683,127 s/min), 1,609,765,772 distinct states found (1,896,906 ds/min), 188,510,848 states left on queue. +Progress(44) at 2024-11-07 03:02:06: 20,215,754,864 states generated (29,358,820 s/min), 1,611,924,139 distinct states found (2,158,367 ds/min), 188,607,723 states left on queue. +Progress(44) at 2024-11-07 03:03:06: 20,245,041,982 states generated (29,287,118 s/min), 1,613,794,702 distinct states found (1,870,563 ds/min), 188,472,700 states left on queue. +Progress(44) at 2024-11-07 03:04:06: 20,274,294,374 states generated (29,252,392 s/min), 1,615,566,733 distinct states found (1,772,031 ds/min), 188,311,061 states left on queue. +Progress(44) at 2024-11-07 03:05:06: 20,303,317,537 states generated (29,023,163 s/min), 1,617,541,966 distinct states found (1,975,233 ds/min), 188,275,227 states left on queue. +Progress(44) at 2024-11-07 03:06:06: 20,332,555,917 states generated (29,238,380 s/min), 1,619,626,477 distinct states found (2,084,511 ds/min), 188,311,129 states left on queue. +Progress(44) at 2024-11-07 03:07:06: 20,361,814,948 states generated (29,259,031 s/min), 1,621,498,944 distinct states found (1,872,467 ds/min), 188,187,982 states left on queue. +Progress(44) at 2024-11-07 03:08:06: 20,391,066,062 states generated (29,251,114 s/min), 1,623,499,145 distinct states found (2,000,201 ds/min), 188,184,372 states left on queue. +Progress(44) at 2024-11-07 03:09:06: 20,420,013,539 states generated (28,947,477 s/min), 1,625,534,256 distinct states found (2,035,111 ds/min), 188,202,174 states left on queue. 
+Progress(44) at 2024-11-07 03:10:06: 20,449,116,787 states generated (29,103,248 s/min), 1,627,670,135 distinct states found (2,135,879 ds/min), 188,303,061 states left on queue. +Progress(44) at 2024-11-07 03:11:06: 20,478,265,224 states generated (29,148,437 s/min), 1,629,558,947 distinct states found (1,888,812 ds/min), 188,171,995 states left on queue. +Progress(44) at 2024-11-07 03:12:06: 20,507,459,785 states generated (29,194,561 s/min), 1,631,460,915 distinct states found (1,901,968 ds/min), 188,044,516 states left on queue. +Progress(44) at 2024-11-07 03:13:06: 20,536,655,025 states generated (29,195,240 s/min), 1,633,292,515 distinct states found (1,831,600 ds/min), 187,823,678 states left on queue. +Progress(44) at 2024-11-07 03:14:06: 20,565,699,198 states generated (29,044,173 s/min), 1,634,967,122 distinct states found (1,674,607 ds/min), 187,564,357 states left on queue. +Progress(44) at 2024-11-07 03:15:06: 20,594,568,781 states generated (28,869,583 s/min), 1,636,996,440 distinct states found (2,029,318 ds/min), 187,577,506 states left on queue. +Progress(44) at 2024-11-07 03:16:06: 20,623,463,526 states generated (28,894,745 s/min), 1,638,870,718 distinct states found (1,874,278 ds/min), 187,429,057 states left on queue. +Progress(44) at 2024-11-07 03:17:06: 20,652,517,975 states generated (29,054,449 s/min), 1,640,608,054 distinct states found (1,737,336 ds/min), 187,198,996 states left on queue. +Progress(44) at 2024-11-07 03:18:06: 20,681,729,377 states generated (29,211,402 s/min), 1,642,682,611 distinct states found (2,074,557 ds/min), 187,238,673 states left on queue. +Progress(44) at 2024-11-07 03:19:06: 20,711,226,363 states generated (29,496,986 s/min), 1,644,764,480 distinct states found (2,081,869 ds/min), 187,269,746 states left on queue. +Progress(44) at 2024-11-07 03:20:06: 20,740,520,876 states generated (29,294,513 s/min), 1,646,565,948 distinct states found (1,801,468 ds/min), 187,085,841 states left on queue. +Progress(44) at 2024-11-07 03:21:06: 20,769,532,066 states generated (29,011,190 s/min), 1,648,139,570 distinct states found (1,573,622 ds/min), 186,737,971 states left on queue. +Progress(44) at 2024-11-07 03:22:06: 20,798,731,555 states generated (29,199,489 s/min), 1,650,061,318 distinct states found (1,921,748 ds/min), 186,652,080 states left on queue. +Progress(44) at 2024-11-07 03:23:06: 20,827,864,871 states generated (29,133,316 s/min), 1,652,217,368 distinct states found (2,156,050 ds/min), 186,786,338 states left on queue. +Progress(44) at 2024-11-07 03:24:06: 20,857,114,542 states generated (29,249,671 s/min), 1,654,404,059 distinct states found (2,186,691 ds/min), 186,937,127 states left on queue. +Progress(44) at 2024-11-07 03:25:06: 20,886,216,235 states generated (29,101,693 s/min), 1,656,424,687 distinct states found (2,020,628 ds/min), 186,925,384 states left on queue. +Progress(44) at 2024-11-07 03:26:06: 20,915,415,138 states generated (29,198,903 s/min), 1,658,503,968 distinct states found (2,079,281 ds/min), 186,988,200 states left on queue. +Progress(44) at 2024-11-07 03:27:06: 20,944,436,117 states generated (29,020,979 s/min), 1,660,708,925 distinct states found (2,204,957 ds/min), 187,151,771 states left on queue. +Progress(44) at 2024-11-07 03:28:06: 20,973,637,986 states generated (29,201,869 s/min), 1,662,812,161 distinct states found (2,103,236 ds/min), 187,208,363 states left on queue. 
+Progress(44) at 2024-11-07 03:29:06: 21,002,664,654 states generated (29,026,668 s/min), 1,665,077,078 distinct states found (2,264,917 ds/min), 187,398,168 states left on queue. +Progress(44) at 2024-11-07 03:30:06: 21,031,900,683 states generated (29,236,029 s/min), 1,667,241,517 distinct states found (2,164,439 ds/min), 187,444,342 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 03:31:07) +Progress(44) at 2024-11-07 03:31:07: 21,061,190,967 states generated (29,290,284 s/min), 1,669,337,346 distinct states found (2,095,829 ds/min), 187,431,388 states left on queue. +Progress(44) at 2024-11-07 03:32:07: 21,090,368,622 states generated (29,177,655 s/min), 1,671,292,120 distinct states found (1,954,774 ds/min), 187,370,395 states left on queue. +Progress(44) at 2024-11-07 03:33:07: 21,119,546,588 states generated (29,177,966 s/min), 1,673,505,061 distinct states found (2,212,941 ds/min), 187,548,275 states left on queue. +Progress(44) at 2024-11-07 03:34:07: 21,148,770,544 states generated (29,223,956 s/min), 1,675,679,331 distinct states found (2,174,270 ds/min), 187,681,477 states left on queue. +Progress(44) at 2024-11-07 03:35:07: 21,177,752,842 states generated (28,982,298 s/min), 1,677,512,003 distinct states found (1,832,672 ds/min), 187,502,663 states left on queue. +Progress(44) at 2024-11-07 03:36:07: 21,206,777,989 states generated (29,025,147 s/min), 1,679,391,120 distinct states found (1,879,117 ds/min), 187,345,876 states left on queue. +Progress(44) at 2024-11-07 03:37:07: 21,235,634,562 states generated (28,856,573 s/min), 1,681,551,461 distinct states found (2,160,341 ds/min), 187,464,887 states left on queue. +Progress(44) at 2024-11-07 03:38:07: 21,264,448,690 states generated (28,814,128 s/min), 1,683,690,836 distinct states found (2,139,375 ds/min), 187,491,922 states left on queue. +Progress(44) at 2024-11-07 03:39:07: 21,293,469,454 states generated (29,020,764 s/min), 1,685,615,643 distinct states found (1,924,807 ds/min), 187,416,199 states left on queue. +Progress(44) at 2024-11-07 03:40:07: 21,322,287,082 states generated (28,817,628 s/min), 1,687,574,723 distinct states found (1,959,080 ds/min), 187,310,981 states left on queue. +Progress(44) at 2024-11-07 03:41:07: 21,351,396,680 states generated (29,109,598 s/min), 1,689,546,445 distinct states found (1,971,722 ds/min), 187,236,923 states left on queue. +Progress(44) at 2024-11-07 03:42:07: 21,380,557,165 states generated (29,160,485 s/min), 1,691,587,169 distinct states found (2,040,724 ds/min), 187,186,480 states left on queue. +Progress(44) at 2024-11-07 03:43:07: 21,409,627,333 states generated (29,070,168 s/min), 1,693,246,645 distinct states found (1,659,476 ds/min), 186,839,410 states left on queue. +Progress(44) at 2024-11-07 03:44:07: 21,438,692,500 states generated (29,065,167 s/min), 1,695,162,088 distinct states found (1,915,443 ds/min), 186,763,843 states left on queue. +Progress(44) at 2024-11-07 03:45:07: 21,467,558,980 states generated (28,866,480 s/min), 1,697,105,328 distinct states found (1,943,240 ds/min), 186,647,091 states left on queue. +Progress(44) at 2024-11-07 03:46:07: 21,496,459,596 states generated (28,900,616 s/min), 1,698,987,134 distinct states found (1,881,806 ds/min), 186,428,411 states left on queue. +Progress(44) at 2024-11-07 03:47:07: 21,525,539,564 states generated (29,079,968 s/min), 1,700,685,335 distinct states found (1,698,201 ds/min), 186,176,831 states left on queue. 
+Progress(44) at 2024-11-07 03:48:07: 21,554,716,115 states generated (29,176,551 s/min), 1,702,193,633 distinct states found (1,508,298 ds/min), 185,811,852 states left on queue. +Progress(44) at 2024-11-07 03:49:07: 21,583,930,332 states generated (29,214,217 s/min), 1,703,965,186 distinct states found (1,771,553 ds/min), 185,645,122 states left on queue. +Progress(44) at 2024-11-07 03:50:07: 21,612,870,304 states generated (28,939,972 s/min), 1,705,581,017 distinct states found (1,615,831 ds/min), 185,385,482 states left on queue. +Progress(44) at 2024-11-07 03:51:07: 21,641,828,993 states generated (28,958,689 s/min), 1,707,209,695 distinct states found (1,628,678 ds/min), 185,147,878 states left on queue. +Progress(44) at 2024-11-07 03:52:07: 21,670,879,227 states generated (29,050,234 s/min), 1,708,891,056 distinct states found (1,681,361 ds/min), 184,967,950 states left on queue. +Progress(44) at 2024-11-07 03:53:07: 21,700,175,853 states generated (29,296,626 s/min), 1,710,442,845 distinct states found (1,551,789 ds/min), 184,628,950 states left on queue. +Progress(44) at 2024-11-07 03:54:07: 21,729,661,920 states generated (29,486,067 s/min), 1,712,360,375 distinct states found (1,917,530 ds/min), 184,602,047 states left on queue. +Progress(44) at 2024-11-07 03:55:07: 21,759,015,470 states generated (29,353,550 s/min), 1,714,259,170 distinct states found (1,898,795 ds/min), 184,554,564 states left on queue. +Progress(44) at 2024-11-07 03:56:07: 21,788,534,088 states generated (29,518,618 s/min), 1,716,081,999 distinct states found (1,822,829 ds/min), 184,406,994 states left on queue. +Progress(44) at 2024-11-07 03:57:07: 21,817,875,474 states generated (29,341,386 s/min), 1,717,634,611 distinct states found (1,552,612 ds/min), 184,057,660 states left on queue. +Progress(44) at 2024-11-07 03:58:07: 21,847,006,510 states generated (29,131,036 s/min), 1,719,299,741 distinct states found (1,665,130 ds/min), 183,828,258 states left on queue. +Progress(44) at 2024-11-07 03:59:07: 21,875,869,357 states generated (28,862,847 s/min), 1,720,801,722 distinct states found (1,501,981 ds/min), 183,443,083 states left on queue. +Progress(44) at 2024-11-07 04:00:07: 21,904,922,732 states generated (29,053,375 s/min), 1,722,588,504 distinct states found (1,786,782 ds/min), 183,289,094 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 04:01:07) +Progress(44) at 2024-11-07 04:01:07: 21,933,965,695 states generated (29,042,963 s/min), 1,724,285,279 distinct states found (1,696,775 ds/min), 183,029,310 states left on queue. +Progress(44) at 2024-11-07 04:02:07: 21,962,959,341 states generated (28,993,646 s/min), 1,725,868,213 distinct states found (1,582,934 ds/min), 182,699,155 states left on queue. +Progress(44) at 2024-11-07 04:03:07: 21,991,777,816 states generated (28,818,475 s/min), 1,727,519,032 distinct states found (1,650,819 ds/min), 182,433,701 states left on queue. +Progress(44) at 2024-11-07 04:04:07: 22,020,733,433 states generated (28,955,617 s/min), 1,729,219,503 distinct states found (1,700,471 ds/min), 182,216,615 states left on queue. +Progress(44) at 2024-11-07 04:05:07: 22,049,984,634 states generated (29,251,201 s/min), 1,730,967,606 distinct states found (1,748,103 ds/min), 182,140,987 states left on queue. +Progress(44) at 2024-11-07 04:06:07: 22,079,112,674 states generated (29,128,040 s/min), 1,732,648,368 distinct states found (1,680,762 ds/min), 181,963,576 states left on queue. 
+Progress(44) at 2024-11-07 04:07:07: 22,108,329,917 states generated (29,217,243 s/min), 1,734,711,160 distinct states found (2,062,792 ds/min), 182,074,434 states left on queue. +Progress(44) at 2024-11-07 04:08:07: 22,137,402,322 states generated (29,072,405 s/min), 1,736,773,111 distinct states found (2,061,951 ds/min), 182,163,318 states left on queue. +Progress(44) at 2024-11-07 04:09:07: 22,166,402,243 states generated (28,999,921 s/min), 1,738,573,615 distinct states found (1,800,504 ds/min), 182,034,194 states left on queue. +Progress(44) at 2024-11-07 04:10:07: 22,195,545,763 states generated (29,143,520 s/min), 1,740,349,901 distinct states found (1,776,286 ds/min), 181,869,339 states left on queue. +Progress(44) at 2024-11-07 04:11:07: 22,224,766,309 states generated (29,220,546 s/min), 1,742,110,577 distinct states found (1,760,676 ds/min), 181,671,885 states left on queue. +Progress(44) at 2024-11-07 04:12:07: 22,253,807,692 states generated (29,041,383 s/min), 1,743,796,752 distinct states found (1,686,175 ds/min), 181,407,584 states left on queue. +Progress(44) at 2024-11-07 04:13:07: 22,282,790,947 states generated (28,983,255 s/min), 1,745,617,175 distinct states found (1,820,423 ds/min), 181,265,096 states left on queue. +Progress(44) at 2024-11-07 04:14:07: 22,311,840,917 states generated (29,049,970 s/min), 1,747,424,658 distinct states found (1,807,483 ds/min), 181,110,335 states left on queue. +Progress(44) at 2024-11-07 04:15:07: 22,340,851,116 states generated (29,010,199 s/min), 1,749,204,899 distinct states found (1,780,241 ds/min), 180,933,264 states left on queue. +Progress(44) at 2024-11-07 04:16:07: 22,369,820,191 states generated (28,969,075 s/min), 1,751,058,290 distinct states found (1,853,391 ds/min), 180,819,450 states left on queue. +Progress(44) at 2024-11-07 04:17:07: 22,398,637,854 states generated (28,817,663 s/min), 1,752,838,012 distinct states found (1,779,722 ds/min), 180,641,066 states left on queue. +Progress(44) at 2024-11-07 04:18:07: 22,427,736,775 states generated (29,098,921 s/min), 1,754,678,716 distinct states found (1,840,704 ds/min), 180,523,907 states left on queue. +Progress(44) at 2024-11-07 04:19:07: 22,456,749,604 states generated (29,012,829 s/min), 1,756,653,204 distinct states found (1,974,488 ds/min), 180,502,441 states left on queue. +Progress(44) at 2024-11-07 04:20:07: 22,485,995,309 states generated (29,245,705 s/min), 1,758,406,219 distinct states found (1,753,015 ds/min), 180,303,710 states left on queue. +Progress(44) at 2024-11-07 04:21:07: 22,515,059,607 states generated (29,064,298 s/min), 1,760,239,858 distinct states found (1,833,639 ds/min), 180,203,277 states left on queue. +Progress(44) at 2024-11-07 04:22:07: 22,544,007,885 states generated (28,948,278 s/min), 1,761,871,023 distinct states found (1,631,165 ds/min), 179,919,396 states left on queue. +Progress(44) at 2024-11-07 04:23:07: 22,572,858,704 states generated (28,850,819 s/min), 1,763,420,170 distinct states found (1,549,147 ds/min), 179,579,696 states left on queue. +Progress(44) at 2024-11-07 04:24:07: 22,601,850,297 states generated (28,991,593 s/min), 1,765,118,103 distinct states found (1,697,933 ds/min), 179,386,571 states left on queue. +Progress(44) at 2024-11-07 04:25:07: 22,630,832,111 states generated (28,981,814 s/min), 1,766,934,802 distinct states found (1,816,699 ds/min), 179,271,264 states left on queue. 
+Progress(44) at 2024-11-07 04:26:07: 22,659,674,047 states generated (28,841,936 s/min), 1,768,697,425 distinct states found (1,762,623 ds/min), 179,093,059 states left on queue. +Progress(44) at 2024-11-07 04:27:07: 22,688,427,580 states generated (28,753,533 s/min), 1,770,450,184 distinct states found (1,752,759 ds/min), 178,899,489 states left on queue. +Progress(44) at 2024-11-07 04:28:07: 22,717,189,869 states generated (28,762,289 s/min), 1,772,256,239 distinct states found (1,806,055 ds/min), 178,731,640 states left on queue. +Progress(44) at 2024-11-07 04:29:07: 22,746,022,343 states generated (28,832,474 s/min), 1,774,044,050 distinct states found (1,787,811 ds/min), 178,570,129 states left on queue. +Progress(44) at 2024-11-07 04:30:07: 22,774,887,995 states generated (28,865,652 s/min), 1,775,840,059 distinct states found (1,796,009 ds/min), 178,368,886 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 04:31:08) +Progress(44) at 2024-11-07 04:31:08: 22,803,877,345 states generated (28,989,350 s/min), 1,777,539,486 distinct states found (1,699,427 ds/min), 178,036,764 states left on queue. +Progress(44) at 2024-11-07 04:32:08: 22,832,344,161 states generated (28,466,816 s/min), 1,779,071,939 distinct states found (1,532,453 ds/min), 177,609,496 states left on queue. +Progress(44) at 2024-11-07 04:33:08: 22,860,965,708 states generated (28,621,547 s/min), 1,780,632,191 distinct states found (1,560,252 ds/min), 177,226,645 states left on queue. +Progress(44) at 2024-11-07 04:34:08: 22,890,116,212 states generated (29,150,504 s/min), 1,782,192,671 distinct states found (1,560,480 ds/min), 176,856,967 states left on queue. +Progress(44) at 2024-11-07 04:35:08: 22,919,394,798 states generated (29,278,586 s/min), 1,783,989,997 distinct states found (1,797,326 ds/min), 176,677,020 states left on queue. +Progress(44) at 2024-11-07 04:36:08: 22,948,717,272 states generated (29,322,474 s/min), 1,785,769,628 distinct states found (1,779,631 ds/min), 176,466,304 states left on queue. +Progress(44) at 2024-11-07 04:37:08: 22,978,008,874 states generated (29,291,602 s/min), 1,787,768,546 distinct states found (1,998,918 ds/min), 176,439,057 states left on queue. +Progress(45) at 2024-11-07 04:38:08: 23,007,259,342 states generated (29,250,468 s/min), 1,789,652,180 distinct states found (1,883,634 ds/min), 176,335,868 states left on queue. +Progress(45) at 2024-11-07 04:39:08: 23,036,414,234 states generated (29,154,892 s/min), 1,791,293,395 distinct states found (1,641,215 ds/min), 176,048,834 states left on queue. +Progress(45) at 2024-11-07 04:40:08: 23,065,467,218 states generated (29,052,984 s/min), 1,793,176,180 distinct states found (1,882,785 ds/min), 175,945,068 states left on queue. +Progress(45) at 2024-11-07 04:41:08: 23,094,601,413 states generated (29,134,195 s/min), 1,795,085,201 distinct states found (1,909,021 ds/min), 175,844,471 states left on queue. +Progress(45) at 2024-11-07 04:42:08: 23,123,835,299 states generated (29,233,886 s/min), 1,796,998,629 distinct states found (1,913,428 ds/min), 175,751,026 states left on queue. +Progress(45) at 2024-11-07 04:43:08: 23,153,014,383 states generated (29,179,084 s/min), 1,798,830,917 distinct states found (1,832,288 ds/min), 175,609,899 states left on queue. +Progress(45) at 2024-11-07 04:44:08: 23,181,848,791 states generated (28,834,408 s/min), 1,800,688,969 distinct states found (1,858,052 ds/min), 175,482,089 states left on queue. 
+Progress(45) at 2024-11-07 04:45:08: 23,210,960,242 states generated (29,111,451 s/min), 1,802,681,838 distinct states found (1,992,869 ds/min), 175,468,259 states left on queue. +Progress(45) at 2024-11-07 04:46:08: 23,239,931,898 states generated (28,971,656 s/min), 1,804,527,297 distinct states found (1,845,459 ds/min), 175,314,676 states left on queue. +Progress(45) at 2024-11-07 04:47:08: 23,269,110,236 states generated (29,178,338 s/min), 1,806,324,412 distinct states found (1,797,115 ds/min), 175,104,294 states left on queue. +Progress(45) at 2024-11-07 04:48:08: 23,298,261,893 states generated (29,151,657 s/min), 1,808,026,372 distinct states found (1,701,960 ds/min), 174,789,761 states left on queue. +Progress(45) at 2024-11-07 04:49:08: 23,327,194,301 states generated (28,932,408 s/min), 1,809,635,143 distinct states found (1,608,771 ds/min), 174,475,327 states left on queue. +Progress(45) at 2024-11-07 04:50:08: 23,356,033,807 states generated (28,839,506 s/min), 1,811,533,685 distinct states found (1,898,542 ds/min), 174,375,697 states left on queue. +Progress(45) at 2024-11-07 04:51:08: 23,384,783,950 states generated (28,750,143 s/min), 1,813,242,773 distinct states found (1,709,088 ds/min), 174,093,638 states left on queue. +Progress(45) at 2024-11-07 04:52:08: 23,413,868,078 states generated (29,084,128 s/min), 1,814,921,217 distinct states found (1,678,444 ds/min), 173,816,375 states left on queue. +Progress(45) at 2024-11-07 04:53:08: 23,443,072,326 states generated (29,204,248 s/min), 1,816,887,463 distinct states found (1,966,246 ds/min), 173,768,064 states left on queue. +Progress(45) at 2024-11-07 04:54:08: 23,472,531,302 states generated (29,458,976 s/min), 1,818,893,389 distinct states found (2,005,926 ds/min), 173,736,986 states left on queue. +Progress(45) at 2024-11-07 04:55:08: 23,501,670,169 states generated (29,138,867 s/min), 1,820,467,013 distinct states found (1,573,624 ds/min), 173,393,980 states left on queue. +Progress(45) at 2024-11-07 04:56:08: 23,530,619,816 states generated (28,949,647 s/min), 1,822,153,389 distinct states found (1,686,376 ds/min), 173,102,476 states left on queue. +Progress(45) at 2024-11-07 04:57:08: 23,559,730,839 states generated (29,111,023 s/min), 1,824,067,840 distinct states found (1,914,451 ds/min), 173,045,910 states left on queue. +Progress(45) at 2024-11-07 04:58:08: 23,588,956,543 states generated (29,225,704 s/min), 1,826,128,132 distinct states found (2,060,292 ds/min), 173,097,456 states left on queue. +Progress(45) at 2024-11-07 04:59:08: 23,617,943,385 states generated (28,986,842 s/min), 1,828,156,857 distinct states found (2,028,725 ds/min), 173,115,797 states left on queue. +Progress(45) at 2024-11-07 05:00:08: 23,647,052,247 states generated (29,108,862 s/min), 1,830,116,296 distinct states found (1,959,439 ds/min), 173,061,677 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 05:01:09) +Progress(45) at 2024-11-07 05:01:09: 23,676,540,644 states generated (29,488,397 s/min), 1,832,081,172 distinct states found (1,964,876 ds/min), 173,019,523 states left on queue. +Progress(45) at 2024-11-07 05:02:09: 23,705,447,239 states generated (28,906,595 s/min), 1,834,157,962 distinct states found (2,076,790 ds/min), 173,069,444 states left on queue. +Progress(45) at 2024-11-07 05:03:09: 23,734,590,381 states generated (29,143,142 s/min), 1,836,148,599 distinct states found (1,990,637 ds/min), 173,037,041 states left on queue. 
+Progress(45) at 2024-11-07 05:04:09: 23,763,605,229 states generated (29,014,848 s/min), 1,838,302,051 distinct states found (2,153,452 ds/min), 173,135,339 states left on queue. +Progress(45) at 2024-11-07 05:05:09: 23,792,794,847 states generated (29,189,618 s/min), 1,840,318,078 distinct states found (2,016,027 ds/min), 173,064,676 states left on queue. +Progress(45) at 2024-11-07 05:06:09: 23,821,711,411 states generated (28,916,564 s/min), 1,842,248,819 distinct states found (1,930,741 ds/min), 172,938,116 states left on queue. +Progress(45) at 2024-11-07 05:07:09: 23,850,829,522 states generated (29,118,111 s/min), 1,844,084,520 distinct states found (1,835,701 ds/min), 172,779,569 states left on queue. +Progress(45) at 2024-11-07 05:08:09: 23,880,027,055 states generated (29,197,533 s/min), 1,846,207,907 distinct states found (2,123,387 ds/min), 172,876,875 states left on queue. +Progress(45) at 2024-11-07 05:09:09: 23,909,238,654 states generated (29,211,599 s/min), 1,848,275,162 distinct states found (2,067,255 ds/min), 172,917,710 states left on queue. +Progress(45) at 2024-11-07 05:10:09: 23,938,254,527 states generated (29,015,873 s/min), 1,850,062,508 distinct states found (1,787,346 ds/min), 172,709,939 states left on queue. +Progress(45) at 2024-11-07 05:11:09: 23,967,280,908 states generated (29,026,381 s/min), 1,851,840,844 distinct states found (1,778,336 ds/min), 172,472,809 states left on queue. +Progress(45) at 2024-11-07 05:12:09: 23,996,137,153 states generated (28,856,245 s/min), 1,853,907,711 distinct states found (2,066,867 ds/min), 172,514,422 states left on queue. +Progress(45) at 2024-11-07 05:13:09: 24,025,003,271 states generated (28,866,118 s/min), 1,855,881,596 distinct states found (1,973,885 ds/min), 172,410,581 states left on queue. +Progress(45) at 2024-11-07 05:14:09: 24,053,998,968 states generated (28,995,697 s/min), 1,857,730,142 distinct states found (1,848,546 ds/min), 172,259,468 states left on queue. +Progress(45) at 2024-11-07 05:15:09: 24,082,780,775 states generated (28,781,807 s/min), 1,859,612,879 distinct states found (1,882,737 ds/min), 172,097,889 states left on queue. +Progress(45) at 2024-11-07 05:16:09: 24,111,843,462 states generated (29,062,687 s/min), 1,861,479,353 distinct states found (1,866,474 ds/min), 171,938,834 states left on queue. +Progress(45) at 2024-11-07 05:17:09: 24,140,987,153 states generated (29,143,691 s/min), 1,863,390,493 distinct states found (1,911,140 ds/min), 171,786,752 states left on queue. +Progress(45) at 2024-11-07 05:18:09: 24,170,023,897 states generated (29,036,744 s/min), 1,864,965,603 distinct states found (1,575,110 ds/min), 171,386,848 states left on queue. +Progress(45) at 2024-11-07 05:19:09: 24,198,987,772 states generated (28,963,875 s/min), 1,866,820,638 distinct states found (1,855,035 ds/min), 171,238,575 states left on queue. +Progress(45) at 2024-11-07 05:20:09: 24,227,820,740 states generated (28,832,968 s/min), 1,868,623,853 distinct states found (1,803,215 ds/min), 171,005,974 states left on queue. +Progress(45) at 2024-11-07 05:21:09: 24,256,712,636 states generated (28,891,896 s/min), 1,870,265,139 distinct states found (1,641,286 ds/min), 170,619,838 states left on queue. +Progress(45) at 2024-11-07 05:22:09: 24,285,792,587 states generated (29,079,951 s/min), 1,871,770,548 distinct states found (1,505,409 ds/min), 170,247,019 states left on queue. 
+Progress(45) at 2024-11-07 05:23:09: 24,315,021,618 states generated (29,229,031 s/min), 1,873,433,426 distinct states found (1,662,878 ds/min), 169,986,497 states left on queue. +Progress(45) at 2024-11-07 05:24:09: 24,343,972,976 states generated (28,951,358 s/min), 1,874,958,509 distinct states found (1,525,083 ds/min), 169,639,357 states left on queue. +Progress(45) at 2024-11-07 05:25:09: 24,372,818,044 states generated (28,845,068 s/min), 1,876,461,909 distinct states found (1,503,400 ds/min), 169,298,313 states left on queue. +Progress(45) at 2024-11-07 05:26:09: 24,401,879,839 states generated (29,061,795 s/min), 1,878,043,093 distinct states found (1,581,184 ds/min), 169,034,999 states left on queue. +Progress(45) at 2024-11-07 05:27:09: 24,431,117,440 states generated (29,237,601 s/min), 1,879,528,913 distinct states found (1,485,820 ds/min), 168,669,766 states left on queue. +Progress(45) at 2024-11-07 05:28:09: 24,460,565,564 states generated (29,448,124 s/min), 1,881,382,841 distinct states found (1,853,928 ds/min), 168,585,549 states left on queue. +Progress(45) at 2024-11-07 05:29:09: 24,489,842,320 states generated (29,276,756 s/min), 1,883,163,526 distinct states found (1,780,685 ds/min), 168,440,866 states left on queue. +Progress(45) at 2024-11-07 05:30:09: 24,519,309,785 states generated (29,467,465 s/min), 1,884,840,978 distinct states found (1,677,452 ds/min), 168,176,100 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 05:31:09) +Progress(45) at 2024-11-07 05:31:09: 24,548,699,426 states generated (29,389,641 s/min), 1,886,346,733 distinct states found (1,505,755 ds/min), 167,794,030 states left on queue. +Progress(45) at 2024-11-07 05:32:09: 24,577,454,860 states generated (28,755,434 s/min), 1,887,761,288 distinct states found (1,414,555 ds/min), 167,342,409 states left on queue. +Progress(45) at 2024-11-07 05:33:09: 24,606,401,929 states generated (28,947,069 s/min), 1,889,451,503 distinct states found (1,690,215 ds/min), 167,115,718 states left on queue. +Progress(45) at 2024-11-07 05:34:09: 24,635,080,181 states generated (28,678,252 s/min), 1,891,013,080 distinct states found (1,561,577 ds/min), 166,760,395 states left on queue. +Progress(45) at 2024-11-07 05:35:09: 24,663,912,233 states generated (28,832,052 s/min), 1,892,486,967 distinct states found (1,473,887 ds/min), 166,347,547 states left on queue. +Progress(45) at 2024-11-07 05:36:09: 24,692,601,003 states generated (28,688,770 s/min), 1,894,014,661 distinct states found (1,527,694 ds/min), 165,980,327 states left on queue. +Progress(45) at 2024-11-07 05:37:09: 24,721,596,280 states generated (28,995,277 s/min), 1,895,667,269 distinct states found (1,652,608 ds/min), 165,766,132 states left on queue. +Progress(45) at 2024-11-07 05:38:09: 24,750,737,270 states generated (29,140,990 s/min), 1,897,304,588 distinct states found (1,637,319 ds/min), 165,602,331 states left on queue. +Progress(45) at 2024-11-07 05:39:09: 24,779,762,621 states generated (29,025,351 s/min), 1,898,944,557 distinct states found (1,639,969 ds/min), 165,399,097 states left on queue. +Progress(45) at 2024-11-07 05:40:09: 24,808,890,636 states generated (29,128,015 s/min), 1,901,039,200 distinct states found (2,094,643 ds/min), 165,505,866 states left on queue. +Progress(45) at 2024-11-07 05:41:09: 24,837,834,330 states generated (28,943,694 s/min), 1,902,825,947 distinct states found (1,786,747 ds/min), 165,385,690 states left on queue. 
+Progress(45) at 2024-11-07 05:42:09: 24,866,749,194 states generated (28,914,864 s/min), 1,904,509,048 distinct states found (1,683,101 ds/min), 165,143,394 states left on queue. +Progress(45) at 2024-11-07 05:43:09: 24,895,891,462 states generated (29,142,268 s/min), 1,906,186,633 distinct states found (1,677,585 ds/min), 164,907,199 states left on queue. +Progress(45) at 2024-11-07 05:44:09: 24,924,929,592 states generated (29,038,130 s/min), 1,907,774,010 distinct states found (1,587,377 ds/min), 164,567,256 states left on queue. +Progress(45) at 2024-11-07 05:45:09: 24,953,854,731 states generated (28,925,139 s/min), 1,909,438,393 distinct states found (1,664,383 ds/min), 164,297,435 states left on queue. +Progress(45) at 2024-11-07 05:46:09: 24,982,773,173 states generated (28,918,442 s/min), 1,911,115,370 distinct states found (1,676,977 ds/min), 164,029,981 states left on queue. +Progress(45) at 2024-11-07 05:47:09: 25,011,681,639 states generated (28,908,466 s/min), 1,912,739,102 distinct states found (1,623,732 ds/min), 163,722,709 states left on queue. +Progress(45) at 2024-11-07 05:48:09: 25,040,624,886 states generated (28,943,247 s/min), 1,914,465,220 distinct states found (1,726,118 ds/min), 163,504,979 states left on queue. +Progress(45) at 2024-11-07 05:49:09: 25,069,369,631 states generated (28,744,745 s/min), 1,916,123,524 distinct states found (1,658,304 ds/min), 163,227,016 states left on queue. +Progress(45) at 2024-11-07 05:50:09: 25,098,381,973 states generated (29,012,342 s/min), 1,917,856,454 distinct states found (1,732,930 ds/min), 163,020,213 states left on queue. +Progress(45) at 2024-11-07 05:51:09: 25,127,432,010 states generated (29,050,037 s/min), 1,919,715,623 distinct states found (1,859,169 ds/min), 162,903,211 states left on queue. +Progress(45) at 2024-11-07 05:52:09: 25,156,554,852 states generated (29,122,842 s/min), 1,921,381,482 distinct states found (1,665,859 ds/min), 162,640,342 states left on queue. +Progress(45) at 2024-11-07 05:53:09: 25,185,439,752 states generated (28,884,900 s/min), 1,923,074,493 distinct states found (1,693,011 ds/min), 162,418,419 states left on queue. +Progress(45) at 2024-11-07 05:54:09: 25,214,250,620 states generated (28,810,868 s/min), 1,924,599,166 distinct states found (1,524,673 ds/min), 162,035,736 states left on queue. +Progress(45) at 2024-11-07 05:55:09: 25,243,065,684 states generated (28,815,064 s/min), 1,926,028,590 distinct states found (1,429,424 ds/min), 161,647,928 states left on queue. +Progress(45) at 2024-11-07 05:56:09: 25,272,074,106 states generated (29,008,422 s/min), 1,927,788,924 distinct states found (1,760,334 ds/min), 161,469,066 states left on queue. +Progress(45) at 2024-11-07 05:57:09: 25,300,916,527 states generated (28,842,421 s/min), 1,929,427,503 distinct states found (1,638,579 ds/min), 161,203,063 states left on queue. +Progress(45) at 2024-11-07 05:58:09: 25,329,617,957 states generated (28,701,430 s/min), 1,931,016,200 distinct states found (1,588,697 ds/min), 160,883,828 states left on queue. +Progress(45) at 2024-11-07 05:59:09: 25,358,305,874 states generated (28,687,917 s/min), 1,932,700,683 distinct states found (1,684,483 ds/min), 160,613,534 states left on queue. +Progress(45) at 2024-11-07 06:00:09: 25,387,060,807 states generated (28,754,933 s/min), 1,934,352,908 distinct states found (1,652,225 ds/min), 160,340,594 states left on queue. 
+Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 06:01:10) +Progress(45) at 2024-11-07 06:01:10: 25,416,167,383 states generated (29,106,576 s/min), 1,936,031,185 distinct states found (1,678,277 ds/min), 160,024,096 states left on queue. +Progress(45) at 2024-11-07 06:02:10: 25,444,775,068 states generated (28,607,685 s/min), 1,937,531,864 distinct states found (1,500,679 ds/min), 159,558,759 states left on queue. +Progress(45) at 2024-11-07 06:03:10: 25,473,218,014 states generated (28,442,946 s/min), 1,938,932,593 distinct states found (1,400,729 ds/min), 159,031,186 states left on queue. +Progress(45) at 2024-11-07 06:04:10: 25,502,153,601 states generated (28,935,587 s/min), 1,940,366,906 distinct states found (1,434,313 ds/min), 158,550,067 states left on queue. +Progress(45) at 2024-11-07 06:05:10: 25,531,409,924 states generated (29,256,323 s/min), 1,942,031,081 distinct states found (1,664,175 ds/min), 158,260,393 states left on queue. +Progress(45) at 2024-11-07 06:06:10: 25,560,798,500 states generated (29,388,576 s/min), 1,943,755,697 distinct states found (1,724,616 ds/min), 158,001,838 states left on queue. +Progress(45) at 2024-11-07 06:07:10: 25,590,101,236 states generated (29,302,736 s/min), 1,945,659,191 distinct states found (1,903,494 ds/min), 157,894,541 states left on queue. +Progress(45) at 2024-11-07 06:08:10: 25,619,347,006 states generated (29,245,770 s/min), 1,947,436,584 distinct states found (1,777,393 ds/min), 157,703,839 states left on queue. +Progress(45) at 2024-11-07 06:09:10: 25,648,466,795 states generated (29,119,789 s/min), 1,949,039,117 distinct states found (1,602,533 ds/min), 157,391,298 states left on queue. +Progress(45) at 2024-11-07 06:10:10: 25,677,360,883 states generated (28,894,088 s/min), 1,950,787,656 distinct states found (1,748,539 ds/min), 157,176,854 states left on queue. +Progress(45) at 2024-11-07 06:11:10: 25,706,625,655 states generated (29,264,772 s/min), 1,952,700,166 distinct states found (1,912,510 ds/min), 157,069,408 states left on queue. +Progress(46) at 2024-11-07 06:12:10: 25,735,830,172 states generated (29,204,517 s/min), 1,954,444,069 distinct states found (1,743,903 ds/min), 156,852,227 states left on queue. +Progress(46) at 2024-11-07 06:13:10: 25,764,811,792 states generated (28,981,620 s/min), 1,956,165,433 distinct states found (1,721,364 ds/min), 156,618,900 states left on queue. +Progress(46) at 2024-11-07 06:14:10: 25,793,740,486 states generated (28,928,694 s/min), 1,957,961,862 distinct states found (1,796,429 ds/min), 156,441,787 states left on queue. +Progress(46) at 2024-11-07 06:15:10: 25,822,741,831 states generated (29,001,345 s/min), 1,959,749,416 distinct states found (1,787,554 ds/min), 156,253,838 states left on queue. +Progress(46) at 2024-11-07 06:16:10: 25,851,804,688 states generated (29,062,857 s/min), 1,961,466,422 distinct states found (1,717,006 ds/min), 155,977,351 states left on queue. +Progress(46) at 2024-11-07 06:17:10: 25,880,868,584 states generated (29,063,896 s/min), 1,963,090,742 distinct states found (1,624,320 ds/min), 155,628,145 states left on queue. +Progress(46) at 2024-11-07 06:18:10: 25,909,824,307 states generated (28,955,723 s/min), 1,964,570,100 distinct states found (1,479,358 ds/min), 155,182,107 states left on queue. +Progress(46) at 2024-11-07 06:19:10: 25,938,584,425 states generated (28,760,118 s/min), 1,966,303,642 distinct states found (1,733,542 ds/min), 154,946,766 states left on queue. 
+Progress(46) at 2024-11-07 06:20:10: 25,967,304,223 states generated (28,719,798 s/min), 1,967,883,207 distinct states found (1,579,565 ds/min), 154,558,935 states left on queue. +Progress(46) at 2024-11-07 06:21:10: 25,996,402,469 states generated (29,098,246 s/min), 1,969,591,000 distinct states found (1,707,793 ds/min), 154,302,069 states left on queue. +Progress(46) at 2024-11-07 06:22:10: 26,025,623,943 states generated (29,221,474 s/min), 1,971,434,403 distinct states found (1,843,403 ds/min), 154,157,059 states left on queue. +Progress(46) at 2024-11-07 06:23:10: 26,055,038,054 states generated (29,414,111 s/min), 1,973,261,720 distinct states found (1,827,317 ds/min), 153,981,317 states left on queue. +Progress(46) at 2024-11-07 06:24:10: 26,083,986,220 states generated (28,948,166 s/min), 1,974,670,648 distinct states found (1,408,928 ds/min), 153,508,388 states left on queue. +Progress(46) at 2024-11-07 06:25:10: 26,113,067,907 states generated (29,081,687 s/min), 1,976,391,547 distinct states found (1,720,899 ds/min), 153,263,845 states left on queue. +Progress(46) at 2024-11-07 06:26:10: 26,142,186,839 states generated (29,118,932 s/min), 1,978,379,881 distinct states found (1,988,334 ds/min), 153,253,200 states left on queue. +Progress(46) at 2024-11-07 06:27:10: 26,171,338,068 states generated (29,151,229 s/min), 1,980,293,569 distinct states found (1,913,688 ds/min), 153,185,559 states left on queue. +Progress(46) at 2024-11-07 06:28:10: 26,200,319,869 states generated (28,981,801 s/min), 1,982,130,034 distinct states found (1,836,465 ds/min), 153,039,826 states left on queue. +Progress(46) at 2024-11-07 06:29:10: 26,229,451,237 states generated (29,131,368 s/min), 1,984,117,981 distinct states found (1,987,947 ds/min), 153,030,792 states left on queue. +Progress(46) at 2024-11-07 06:30:10: 26,258,476,767 states generated (29,025,530 s/min), 1,985,981,073 distinct states found (1,863,092 ds/min), 152,917,939 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 06:31:11) +Progress(46) at 2024-11-07 06:31:11: 26,287,657,848 states generated (29,181,081 s/min), 1,987,875,178 distinct states found (1,894,105 ds/min), 152,784,901 states left on queue. +Progress(46) at 2024-11-07 06:32:11: 26,316,549,803 states generated (28,891,955 s/min), 1,989,821,141 distinct states found (1,945,963 ds/min), 152,728,813 states left on queue. +Progress(46) at 2024-11-07 06:33:11: 26,345,570,902 states generated (29,021,099 s/min), 1,991,762,973 distinct states found (1,941,832 ds/min), 152,648,662 states left on queue. +Progress(46) at 2024-11-07 06:34:11: 26,374,519,051 states generated (28,948,149 s/min), 1,993,605,958 distinct states found (1,842,985 ds/min), 152,446,201 states left on queue. +Progress(46) at 2024-11-07 06:35:11: 26,403,403,284 states generated (28,884,233 s/min), 1,995,379,328 distinct states found (1,773,370 ds/min), 152,189,032 states left on queue. +Progress(46) at 2024-11-07 06:36:11: 26,432,512,518 states generated (29,109,234 s/min), 1,997,205,848 distinct states found (1,826,520 ds/min), 152,060,823 states left on queue. +Progress(46) at 2024-11-07 06:37:11: 26,461,635,963 states generated (29,123,445 s/min), 1,999,221,288 distinct states found (2,015,440 ds/min), 152,052,317 states left on queue. +Progress(46) at 2024-11-07 06:38:11: 26,490,692,408 states generated (29,056,445 s/min), 2,001,003,940 distinct states found (1,782,652 ds/min), 151,869,333 states left on queue. 
+Progress(46) at 2024-11-07 06:39:11: 26,519,611,691 states generated (28,919,283 s/min), 2,002,772,264 distinct states found (1,768,324 ds/min), 151,637,576 states left on queue. +Progress(46) at 2024-11-07 06:40:11: 26,548,405,773 states generated (28,794,082 s/min), 2,004,530,832 distinct states found (1,758,568 ds/min), 151,415,653 states left on queue. +Progress(46) at 2024-11-07 06:41:11: 26,577,168,173 states generated (28,762,400 s/min), 2,006,431,383 distinct states found (1,900,551 ds/min), 151,293,655 states left on queue. +Progress(46) at 2024-11-07 06:42:11: 26,606,013,565 states generated (28,845,392 s/min), 2,008,118,930 distinct states found (1,687,547 ds/min), 150,979,607 states left on queue. +Progress(46) at 2024-11-07 06:43:11: 26,634,840,454 states generated (28,826,889 s/min), 2,010,033,233 distinct states found (1,914,303 ds/min), 150,859,233 states left on queue. +Progress(46) at 2024-11-07 06:44:11: 26,663,791,564 states generated (28,951,110 s/min), 2,011,764,506 distinct states found (1,731,273 ds/min), 150,592,176 states left on queue. +Progress(46) at 2024-11-07 06:45:11: 26,692,845,560 states generated (29,053,996 s/min), 2,013,541,948 distinct states found (1,777,442 ds/min), 150,346,125 states left on queue. +Progress(46) at 2024-11-07 06:46:11: 26,721,838,462 states generated (28,992,902 s/min), 2,015,055,311 distinct states found (1,513,363 ds/min), 149,898,025 states left on queue. +Progress(46) at 2024-11-07 06:47:11: 26,750,784,724 states generated (28,946,262 s/min), 2,016,795,791 distinct states found (1,740,480 ds/min), 149,636,143 states left on queue. +Progress(46) at 2024-11-07 06:48:11: 26,779,537,729 states generated (28,753,005 s/min), 2,018,420,817 distinct states found (1,625,026 ds/min), 149,264,338 states left on queue. +Progress(46) at 2024-11-07 06:49:11: 26,808,414,064 states generated (28,876,335 s/min), 2,019,941,133 distinct states found (1,520,316 ds/min), 148,833,851 states left on queue. +Progress(46) at 2024-11-07 06:50:11: 26,837,552,895 states generated (29,138,831 s/min), 2,021,402,334 distinct states found (1,461,201 ds/min), 148,423,082 states left on queue. +Progress(46) at 2024-11-07 06:51:11: 26,866,488,521 states generated (28,935,626 s/min), 2,022,896,299 distinct states found (1,493,965 ds/min), 148,037,640 states left on queue. +Progress(46) at 2024-11-07 06:52:11: 26,895,259,654 states generated (28,771,133 s/min), 2,024,306,180 distinct states found (1,409,881 ds/min), 147,623,626 states left on queue. +Progress(46) at 2024-11-07 06:53:11: 26,924,324,639 states generated (29,064,985 s/min), 2,025,751,691 distinct states found (1,445,511 ds/min), 147,237,191 states left on queue. +Progress(46) at 2024-11-07 06:54:11: 26,953,575,306 states generated (29,250,667 s/min), 2,027,292,041 distinct states found (1,540,350 ds/min), 146,929,253 states left on queue. +Progress(46) at 2024-11-07 06:55:11: 26,982,863,734 states generated (29,288,428 s/min), 2,029,056,116 distinct states found (1,764,075 ds/min), 146,774,179 states left on queue. +Progress(46) at 2024-11-07 06:56:11: 27,012,217,899 states generated (29,354,165 s/min), 2,030,705,091 distinct states found (1,648,975 ds/min), 146,523,776 states left on queue. +Progress(46) at 2024-11-07 06:57:11: 27,041,431,406 states generated (29,213,507 s/min), 2,032,122,917 distinct states found (1,417,826 ds/min), 146,066,712 states left on queue. 
+Progress(46) at 2024-11-07 06:58:11: 27,070,230,233 states generated (28,798,827 s/min), 2,033,502,867 distinct states found (1,379,950 ds/min), 145,580,465 states left on queue. +Progress(46) at 2024-11-07 06:59:11: 27,099,119,410 states generated (28,889,177 s/min), 2,035,080,295 distinct states found (1,577,428 ds/min), 145,255,429 states left on queue. +Progress(46) at 2024-11-07 07:00:11: 27,127,802,546 states generated (28,683,136 s/min), 2,036,480,069 distinct states found (1,399,774 ds/min), 144,763,326 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 07:01:11) +Progress(46) at 2024-11-07 07:01:11: 27,156,729,000 states generated (28,926,454 s/min), 2,037,888,171 distinct states found (1,408,102 ds/min), 144,282,188 states left on queue. +Progress(46) at 2024-11-07 07:02:11: 27,185,673,878 states generated (28,944,878 s/min), 2,039,404,499 distinct states found (1,516,328 ds/min), 143,933,899 states left on queue. +Progress(46) at 2024-11-07 07:03:11: 27,214,800,380 states generated (29,126,502 s/min), 2,040,991,907 distinct states found (1,587,408 ds/min), 143,736,528 states left on queue. +Progress(46) at 2024-11-07 07:04:11: 27,243,805,336 states generated (29,004,956 s/min), 2,042,560,493 distinct states found (1,568,586 ds/min), 143,474,607 states left on queue. +Progress(46) at 2024-11-07 07:05:11: 27,272,912,902 states generated (29,107,566 s/min), 2,044,549,687 distinct states found (1,989,194 ds/min), 143,501,934 states left on queue. +Progress(46) at 2024-11-07 07:06:11: 27,301,850,628 states generated (28,937,726 s/min), 2,046,213,816 distinct states found (1,664,129 ds/min), 143,280,971 states left on queue. +Progress(46) at 2024-11-07 07:07:11: 27,330,744,799 states generated (28,894,171 s/min), 2,047,777,121 distinct states found (1,563,305 ds/min), 142,943,602 states left on queue. +Progress(46) at 2024-11-07 07:08:11: 27,359,855,477 states generated (29,110,678 s/min), 2,049,356,015 distinct states found (1,578,894 ds/min), 142,631,188 states left on queue. +Progress(46) at 2024-11-07 07:09:11: 27,388,745,464 states generated (28,889,987 s/min), 2,050,822,496 distinct states found (1,466,481 ds/min), 142,190,439 states left on queue. +Progress(46) at 2024-11-07 07:10:11: 27,417,576,550 states generated (28,831,086 s/min), 2,052,379,523 distinct states found (1,557,027 ds/min), 141,821,153 states left on queue. +Progress(46) at 2024-11-07 07:11:11: 27,446,546,405 states generated (28,969,855 s/min), 2,053,934,499 distinct states found (1,554,976 ds/min), 141,462,097 states left on queue. +Progress(46) at 2024-11-07 07:12:11: 27,475,398,683 states generated (28,852,278 s/min), 2,055,510,649 distinct states found (1,576,150 ds/min), 141,116,110 states left on queue. +Progress(46) at 2024-11-07 07:13:11: 27,504,113,194 states generated (28,714,511 s/min), 2,057,051,677 distinct states found (1,541,028 ds/min), 140,743,906 states left on queue. +Progress(46) at 2024-11-07 07:14:11: 27,532,983,174 states generated (28,869,980 s/min), 2,058,669,649 distinct states found (1,617,972 ds/min), 140,436,853 states left on queue. +Progress(46) at 2024-11-07 07:15:11: 27,562,088,285 states generated (29,105,111 s/min), 2,060,404,146 distinct states found (1,734,497 ds/min), 140,213,296 states left on queue. +Progress(46) at 2024-11-07 07:16:11: 27,591,079,273 states generated (28,990,988 s/min), 2,061,979,907 distinct states found (1,575,761 ds/min), 139,895,056 states left on queue. 
+Progress(46) at 2024-11-07 07:17:11: 27,619,876,413 states generated (28,797,140 s/min), 2,063,482,225 distinct states found (1,502,318 ds/min), 139,506,174 states left on queue. +Progress(46) at 2024-11-07 07:18:11: 27,648,595,649 states generated (28,719,236 s/min), 2,064,847,355 distinct states found (1,365,130 ds/min), 139,035,783 states left on queue. +Progress(46) at 2024-11-07 07:19:11: 27,677,544,192 states generated (28,948,543 s/min), 2,066,507,355 distinct states found (1,660,000 ds/min), 138,783,592 states left on queue. +Progress(46) at 2024-11-07 07:20:11: 27,706,306,461 states generated (28,762,269 s/min), 2,068,019,192 distinct states found (1,511,837 ds/min), 138,418,256 states left on queue. +Progress(46) at 2024-11-07 07:21:11: 27,734,873,733 states generated (28,567,272 s/min), 2,069,467,142 distinct states found (1,447,950 ds/min), 137,977,630 states left on queue. +Progress(46) at 2024-11-07 07:22:11: 27,763,678,204 states generated (28,804,471 s/min), 2,071,034,824 distinct states found (1,567,682 ds/min), 137,622,296 states left on queue. +Progress(46) at 2024-11-07 07:23:11: 27,792,322,332 states generated (28,644,128 s/min), 2,072,586,226 distinct states found (1,551,402 ds/min), 137,231,762 states left on queue. +Progress(46) at 2024-11-07 07:24:11: 27,821,040,127 states generated (28,717,795 s/min), 2,074,030,831 distinct states found (1,444,605 ds/min), 136,731,600 states left on queue. +Progress(46) at 2024-11-07 07:25:11: 27,849,404,654 states generated (28,364,527 s/min), 2,075,273,409 distinct states found (1,242,578 ds/min), 136,082,131 states left on queue. +Progress(46) at 2024-11-07 07:26:11: 27,878,356,417 states generated (28,951,763 s/min), 2,076,656,601 distinct states found (1,383,192 ds/min), 135,570,796 states left on queue. +Progress(46) at 2024-11-07 07:27:11: 27,907,776,802 states generated (29,420,385 s/min), 2,078,383,391 distinct states found (1,726,790 ds/min), 135,306,248 states left on queue. +Progress(46) at 2024-11-07 07:28:11: 27,937,070,294 states generated (29,293,492 s/min), 2,080,076,828 distinct states found (1,693,437 ds/min), 135,034,380 states left on queue. +Progress(46) at 2024-11-07 07:29:11: 27,966,287,907 states generated (29,217,613 s/min), 2,081,855,223 distinct states found (1,778,395 ds/min), 134,839,763 states left on queue. +Progress(46) at 2024-11-07 07:30:11: 27,995,330,759 states generated (29,042,852 s/min), 2,083,372,197 distinct states found (1,516,974 ds/min), 134,461,641 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 07:31:12) +Progress(46) at 2024-11-07 07:31:12: 28,024,387,579 states generated (29,056,820 s/min), 2,085,018,193 distinct states found (1,645,996 ds/min), 134,150,126 states left on queue. +Progress(46) at 2024-11-07 07:32:12: 28,053,564,379 states generated (29,176,800 s/min), 2,086,850,158 distinct states found (1,831,965 ds/min), 133,983,084 states left on queue. +Progress(46) at 2024-11-07 07:33:12: 28,082,556,747 states generated (28,992,368 s/min), 2,088,444,271 distinct states found (1,594,113 ds/min), 133,651,269 states left on queue. +Progress(47) at 2024-11-07 07:34:12: 28,111,323,007 states generated (28,766,260 s/min), 2,090,072,790 distinct states found (1,628,519 ds/min), 133,350,640 states left on queue. +Progress(47) at 2024-11-07 07:35:12: 28,140,191,163 states generated (28,868,156 s/min), 2,091,740,224 distinct states found (1,667,434 ds/min), 133,070,266 states left on queue. 
+Progress(47) at 2024-11-07 07:36:12: 28,169,054,601 states generated (28,863,438 s/min), 2,093,375,975 distinct states found (1,635,751 ds/min), 132,752,319 states left on queue. +Progress(47) at 2024-11-07 07:37:12: 28,197,994,162 states generated (28,939,561 s/min), 2,094,929,793 distinct states found (1,553,818 ds/min), 132,356,738 states left on queue. +Progress(47) at 2024-11-07 07:38:12: 28,226,808,491 states generated (28,814,329 s/min), 2,096,311,441 distinct states found (1,381,648 ds/min), 131,832,292 states left on queue. +Progress(47) at 2024-11-07 07:39:12: 28,255,451,016 states generated (28,642,525 s/min), 2,097,907,185 distinct states found (1,595,744 ds/min), 131,487,862 states left on queue. +Progress(47) at 2024-11-07 07:40:12: 28,284,015,286 states generated (28,564,270 s/min), 2,099,332,452 distinct states found (1,425,267 ds/min), 130,982,897 states left on queue. +Progress(47) at 2024-11-07 07:41:12: 28,313,051,806 states generated (29,036,520 s/min), 2,101,053,792 distinct states found (1,721,340 ds/min), 130,744,522 states left on queue. +Progress(47) at 2024-11-07 07:42:12: 28,342,348,160 states generated (29,296,354 s/min), 2,102,778,970 distinct states found (1,725,178 ds/min), 130,505,777 states left on queue. +Progress(47) at 2024-11-07 07:43:12: 28,371,533,935 states generated (29,185,775 s/min), 2,104,337,778 distinct states found (1,558,808 ds/min), 130,144,304 states left on queue. +Progress(47) at 2024-11-07 07:44:12: 28,400,351,066 states generated (28,817,131 s/min), 2,105,835,284 distinct states found (1,497,506 ds/min), 129,719,871 states left on queue. +Progress(47) at 2024-11-07 07:45:12: 28,429,411,463 states generated (29,060,397 s/min), 2,107,704,752 distinct states found (1,869,468 ds/min), 129,618,749 states left on queue. +Progress(47) at 2024-11-07 07:46:12: 28,458,488,093 states generated (29,076,630 s/min), 2,109,483,825 distinct states found (1,779,073 ds/min), 129,439,723 states left on queue. +Progress(47) at 2024-11-07 07:47:12: 28,487,338,391 states generated (28,850,298 s/min), 2,111,230,358 distinct states found (1,746,533 ds/min), 129,232,124 states left on queue. +Progress(47) at 2024-11-07 07:48:12: 28,516,411,931 states generated (29,073,540 s/min), 2,113,150,785 distinct states found (1,920,427 ds/min), 129,168,385 states left on queue. +Progress(47) at 2024-11-07 07:49:12: 28,545,299,037 states generated (28,887,106 s/min), 2,114,878,071 distinct states found (1,727,286 ds/min), 128,948,735 states left on queue. +Progress(47) at 2024-11-07 07:50:12: 28,574,186,091 states generated (28,887,054 s/min), 2,116,622,746 distinct states found (1,744,675 ds/min), 128,711,386 states left on queue. +Progress(47) at 2024-11-07 07:51:12: 28,603,057,442 states generated (28,871,351 s/min), 2,118,435,710 distinct states found (1,812,964 ds/min), 128,543,573 states left on queue. +Progress(47) at 2024-11-07 07:52:12: 28,632,042,720 states generated (28,985,278 s/min), 2,120,240,818 distinct states found (1,805,108 ds/min), 128,349,742 states left on queue. +Progress(47) at 2024-11-07 07:53:12: 28,660,885,097 states generated (28,842,377 s/min), 2,121,904,885 distinct states found (1,664,067 ds/min), 128,002,987 states left on queue. +Progress(47) at 2024-11-07 07:54:12: 28,689,690,902 states generated (28,805,805 s/min), 2,123,498,767 distinct states found (1,593,882 ds/min), 127,622,035 states left on queue. 
+Progress(47) at 2024-11-07 07:55:12: 28,718,827,206 states generated (29,136,304 s/min), 2,125,375,087 distinct states found (1,876,320 ds/min), 127,518,682 states left on queue. +Progress(47) at 2024-11-07 07:56:12: 28,747,988,287 states generated (29,161,081 s/min), 2,127,234,055 distinct states found (1,858,968 ds/min), 127,390,123 states left on queue. +Progress(47) at 2024-11-07 07:57:12: 28,776,918,449 states generated (28,930,162 s/min), 2,128,896,639 distinct states found (1,662,584 ds/min), 127,099,202 states left on queue. +Progress(47) at 2024-11-07 07:58:12: 28,805,826,521 states generated (28,908,072 s/min), 2,130,485,896 distinct states found (1,589,257 ds/min), 126,731,846 states left on queue. +Progress(47) at 2024-11-07 07:59:12: 28,834,550,061 states generated (28,723,540 s/min), 2,132,267,049 distinct states found (1,781,153 ds/min), 126,524,859 states left on queue. +Progress(47) at 2024-11-07 08:00:12: 28,863,218,037 states generated (28,667,976 s/min), 2,133,901,471 distinct states found (1,634,422 ds/min), 126,149,810 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 08:01:13) +Progress(47) at 2024-11-07 08:01:13: 28,892,405,277 states generated (29,187,240 s/min), 2,135,683,266 distinct states found (1,781,795 ds/min), 125,938,046 states left on queue. +Progress(47) at 2024-11-07 08:02:13: 28,921,188,007 states generated (28,782,730 s/min), 2,137,299,589 distinct states found (1,616,323 ds/min), 125,575,223 states left on queue. +Progress(47) at 2024-11-07 08:03:13: 28,950,198,581 states generated (29,010,574 s/min), 2,138,945,715 distinct states found (1,646,126 ds/min), 125,225,825 states left on queue. +Progress(47) at 2024-11-07 08:04:13: 28,979,052,322 states generated (28,853,741 s/min), 2,140,384,312 distinct states found (1,438,597 ds/min), 124,739,890 states left on queue. +Progress(47) at 2024-11-07 08:05:13: 29,007,862,556 states generated (28,810,234 s/min), 2,142,020,690 distinct states found (1,636,378 ds/min), 124,389,570 states left on queue. +Progress(47) at 2024-11-07 08:06:13: 29,036,639,997 states generated (28,777,441 s/min), 2,143,436,769 distinct states found (1,416,079 ds/min), 123,853,456 states left on queue. +Progress(47) at 2024-11-07 08:07:13: 29,065,681,489 states generated (29,041,492 s/min), 2,144,841,718 distinct states found (1,404,949 ds/min), 123,385,042 states left on queue. +Progress(47) at 2024-11-07 08:08:13: 29,094,462,032 states generated (28,780,543 s/min), 2,146,214,867 distinct states found (1,373,149 ds/min), 122,908,921 states left on queue. +Progress(47) at 2024-11-07 08:09:13: 29,123,289,758 states generated (28,827,726 s/min), 2,147,553,984 distinct states found (1,339,117 ds/min), 122,446,193 states left on queue. +Progress(47) at 2024-11-07 08:10:13: 29,152,503,386 states generated (29,213,628 s/min), 2,148,942,911 distinct states found (1,388,927 ds/min), 122,030,640 states left on queue. +Progress(47) at 2024-11-07 08:11:13: 29,181,728,737 states generated (29,225,351 s/min), 2,150,631,619 distinct states found (1,688,708 ds/min), 121,816,919 states left on queue. +Progress(47) at 2024-11-07 08:12:13: 29,211,003,478 states generated (29,274,741 s/min), 2,152,175,254 distinct states found (1,543,635 ds/min), 121,489,774 states left on queue. +Progress(47) at 2024-11-07 08:13:13: 29,240,102,268 states generated (29,098,790 s/min), 2,153,537,952 distinct states found (1,362,698 ds/min), 120,992,206 states left on queue. 
+Progress(47) at 2024-11-07 08:14:13: 29,268,843,458 states generated (28,741,190 s/min), 2,154,896,522 distinct states found (1,358,570 ds/min), 120,481,830 states left on queue. +Progress(47) at 2024-11-07 08:15:13: 29,297,458,982 states generated (28,615,524 s/min), 2,156,228,693 distinct states found (1,332,171 ds/min), 119,935,590 states left on queue. +Progress(47) at 2024-11-07 08:16:13: 29,326,133,934 states generated (28,674,952 s/min), 2,157,558,222 distinct states found (1,329,529 ds/min), 119,402,611 states left on queue. +Progress(47) at 2024-11-07 08:17:13: 29,355,133,179 states generated (28,999,245 s/min), 2,159,036,229 distinct states found (1,478,007 ds/min), 119,059,305 states left on queue. +Progress(47) at 2024-11-07 08:18:13: 29,384,094,216 states generated (28,961,037 s/min), 2,160,401,726 distinct states found (1,365,497 ds/min), 118,659,528 states left on queue. +Progress(47) at 2024-11-07 08:19:13: 29,413,210,497 states generated (29,116,281 s/min), 2,162,252,062 distinct states found (1,850,336 ds/min), 118,605,990 states left on queue. +Progress(47) at 2024-11-07 08:20:13: 29,442,123,726 states generated (28,913,229 s/min), 2,163,968,572 distinct states found (1,716,510 ds/min), 118,430,828 states left on queue. +Progress(47) at 2024-11-07 08:21:13: 29,470,933,813 states generated (28,810,087 s/min), 2,165,411,802 distinct states found (1,443,230 ds/min), 118,017,068 states left on queue. +Progress(47) at 2024-11-07 08:22:13: 29,499,968,878 states generated (29,035,065 s/min), 2,166,884,069 distinct states found (1,472,267 ds/min), 117,620,342 states left on queue. +Progress(47) at 2024-11-07 08:23:13: 29,528,752,811 states generated (28,783,933 s/min), 2,168,252,577 distinct states found (1,368,508 ds/min), 117,101,560 states left on queue. +Progress(47) at 2024-11-07 08:24:13: 29,557,568,598 states generated (28,815,787 s/min), 2,169,705,158 distinct states found (1,452,581 ds/min), 116,651,662 states left on queue. +Progress(47) at 2024-11-07 08:25:13: 29,586,373,945 states generated (28,805,347 s/min), 2,171,138,563 distinct states found (1,433,405 ds/min), 116,184,414 states left on queue. +Progress(47) at 2024-11-07 08:26:13: 29,614,983,668 states generated (28,609,723 s/min), 2,172,585,802 distinct states found (1,447,239 ds/min), 115,737,683 states left on queue. +Progress(47) at 2024-11-07 08:27:13: 29,643,800,320 states generated (28,816,652 s/min), 2,174,078,381 distinct states found (1,492,579 ds/min), 115,326,021 states left on queue. +Progress(47) at 2024-11-07 08:28:13: 29,672,907,645 states generated (29,107,325 s/min), 2,175,677,520 distinct states found (1,599,139 ds/min), 114,997,885 states left on queue. +Progress(47) at 2024-11-07 08:29:13: 29,701,705,556 states generated (28,797,911 s/min), 2,177,190,856 distinct states found (1,513,336 ds/min), 114,628,087 states left on queue. +Progress(47) at 2024-11-07 08:30:13: 29,730,412,995 states generated (28,707,439 s/min), 2,178,512,609 distinct states found (1,321,753 ds/min), 114,087,841 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 08:31:13) +Progress(47) at 2024-11-07 08:31:13: 29,759,387,383 states generated (28,974,388 s/min), 2,180,040,685 distinct states found (1,528,076 ds/min), 113,747,306 states left on queue. +Progress(47) at 2024-11-07 08:32:13: 29,788,001,065 states generated (28,613,682 s/min), 2,181,419,698 distinct states found (1,379,013 ds/min), 113,276,454 states left on queue. 
+Progress(47) at 2024-11-07 08:33:13: 29,816,483,253 states generated (28,482,188 s/min), 2,182,794,509 distinct states found (1,374,811 ds/min), 112,764,181 states left on queue. +Progress(47) at 2024-11-07 08:34:13: 29,845,032,133 states generated (28,548,880 s/min), 2,184,180,577 distinct states found (1,386,068 ds/min), 112,275,854 states left on queue. +Progress(47) at 2024-11-07 08:35:13: 29,873,704,121 states generated (28,671,988 s/min), 2,185,610,616 distinct states found (1,430,039 ds/min), 111,765,886 states left on queue. +Progress(47) at 2024-11-07 08:36:13: 29,901,983,007 states generated (28,278,886 s/min), 2,186,742,865 distinct states found (1,132,249 ds/min), 111,037,502 states left on queue. +Progress(47) at 2024-11-07 08:37:13: 29,931,128,222 states generated (29,145,215 s/min), 2,188,247,053 distinct states found (1,504,188 ds/min), 110,610,871 states left on queue. +Progress(47) at 2024-11-07 08:38:13: 29,960,291,600 states generated (29,163,378 s/min), 2,189,791,380 distinct states found (1,544,327 ds/min), 110,219,347 states left on queue. +Progress(47) at 2024-11-07 08:39:13: 29,989,426,093 states generated (29,134,493 s/min), 2,191,523,686 distinct states found (1,732,306 ds/min), 109,988,090 states left on queue. +Progress(47) at 2024-11-07 08:40:13: 30,018,419,613 states generated (28,993,520 s/min), 2,192,983,724 distinct states found (1,460,038 ds/min), 109,567,153 states left on queue. +Progress(47) at 2024-11-07 08:41:13: 30,047,169,261 states generated (28,749,648 s/min), 2,194,485,325 distinct states found (1,501,601 ds/min), 109,159,610 states left on queue. +Progress(47) at 2024-11-07 08:42:13: 30,076,320,011 states generated (29,150,750 s/min), 2,196,261,852 distinct states found (1,776,527 ds/min), 108,952,775 states left on queue. +Progress(47) at 2024-11-07 08:43:13: 30,105,246,939 states generated (28,926,928 s/min), 2,197,745,917 distinct states found (1,484,065 ds/min), 108,533,801 states left on queue. +Progress(47) at 2024-11-07 08:44:13: 30,134,017,722 states generated (28,770,783 s/min), 2,199,274,846 distinct states found (1,528,929 ds/min), 108,138,210 states left on queue. +Progress(48) at 2024-11-07 08:45:13: 30,162,850,009 states generated (28,832,287 s/min), 2,200,818,695 distinct states found (1,543,849 ds/min), 107,749,686 states left on queue. +Progress(48) at 2024-11-07 08:46:13: 30,191,763,541 states generated (28,913,532 s/min), 2,202,269,881 distinct states found (1,451,186 ds/min), 107,274,074 states left on queue. +Progress(48) at 2024-11-07 08:47:13: 30,220,450,821 states generated (28,687,280 s/min), 2,203,579,369 distinct states found (1,309,488 ds/min), 106,693,506 states left on queue. +Progress(48) at 2024-11-07 08:48:13: 30,249,109,647 states generated (28,658,826 s/min), 2,204,980,828 distinct states found (1,401,459 ds/min), 106,171,815 states left on queue. +Progress(48) at 2024-11-07 08:49:13: 30,278,004,502 states generated (28,894,855 s/min), 2,206,546,641 distinct states found (1,565,813 ds/min), 105,800,017 states left on queue. +Progress(48) at 2024-11-07 08:50:13: 30,307,176,628 states generated (29,172,126 s/min), 2,208,173,395 distinct states found (1,626,754 ds/min), 105,492,735 states left on queue. +Progress(48) at 2024-11-07 08:51:13: 30,336,267,563 states generated (29,090,935 s/min), 2,209,629,083 distinct states found (1,455,688 ds/min), 105,046,561 states left on queue. 
+Progress(48) at 2024-11-07 08:52:13: 30,365,237,300 states generated (28,969,737 s/min), 2,211,221,126 distinct states found (1,592,043 ds/min), 104,707,696 states left on queue. +Progress(48) at 2024-11-07 08:53:13: 30,394,270,909 states generated (29,033,609 s/min), 2,212,948,437 distinct states found (1,727,311 ds/min), 104,490,104 states left on queue. +Progress(48) at 2024-11-07 08:54:13: 30,423,140,115 states generated (28,869,206 s/min), 2,214,640,116 distinct states found (1,691,679 ds/min), 104,243,061 states left on queue. +Progress(48) at 2024-11-07 08:55:13: 30,452,062,605 states generated (28,922,490 s/min), 2,216,327,939 distinct states found (1,687,823 ds/min), 103,983,745 states left on queue. +Progress(48) at 2024-11-07 08:56:13: 30,481,071,056 states generated (29,008,451 s/min), 2,217,983,905 distinct states found (1,655,966 ds/min), 103,702,586 states left on queue. +Progress(48) at 2024-11-07 08:57:13: 30,509,808,031 states generated (28,736,975 s/min), 2,219,662,593 distinct states found (1,678,688 ds/min), 103,423,522 states left on queue. +Progress(48) at 2024-11-07 08:58:13: 30,538,616,862 states generated (28,808,831 s/min), 2,221,288,821 distinct states found (1,626,228 ds/min), 103,098,334 states left on queue. +Progress(48) at 2024-11-07 08:59:13: 30,567,539,949 states generated (28,923,087 s/min), 2,222,969,669 distinct states found (1,680,848 ds/min), 102,811,145 states left on queue. +Progress(48) at 2024-11-07 09:00:13: 30,596,220,572 states generated (28,680,623 s/min), 2,224,451,086 distinct states found (1,481,417 ds/min), 102,320,643 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 09:01:14) +Progress(48) at 2024-11-07 09:01:14: 30,625,254,005 states generated (29,033,433 s/min), 2,225,971,213 distinct states found (1,520,127 ds/min), 101,895,678 states left on queue. +Progress(48) at 2024-11-07 09:02:14: 30,654,316,875 states generated (29,062,870 s/min), 2,227,776,007 distinct states found (1,804,794 ds/min), 101,720,925 states left on queue. +Progress(48) at 2024-11-07 09:03:14: 30,683,368,837 states generated (29,051,962 s/min), 2,229,520,592 distinct states found (1,744,585 ds/min), 101,516,049 states left on queue. +Progress(48) at 2024-11-07 09:04:14: 30,712,221,770 states generated (28,852,933 s/min), 2,231,006,576 distinct states found (1,485,984 ds/min), 101,059,951 states left on queue. +Progress(48) at 2024-11-07 09:05:14: 30,740,916,958 states generated (28,695,188 s/min), 2,232,634,565 distinct states found (1,627,989 ds/min), 100,742,863 states left on queue. +Progress(48) at 2024-11-07 09:06:14: 30,769,477,527 states generated (28,560,569 s/min), 2,234,099,495 distinct states found (1,464,930 ds/min), 100,237,731 states left on queue. +Progress(48) at 2024-11-07 09:07:14: 30,798,306,365 states generated (28,828,838 s/min), 2,235,757,510 distinct states found (1,658,015 ds/min), 99,936,798 states left on queue. +Progress(48) at 2024-11-07 09:08:14: 30,827,145,014 states generated (28,838,649 s/min), 2,237,323,374 distinct states found (1,565,864 ds/min), 99,542,928 states left on queue. +Progress(48) at 2024-11-07 09:09:14: 30,855,967,384 states generated (28,822,370 s/min), 2,238,712,445 distinct states found (1,389,071 ds/min), 98,994,892 states left on queue. +Progress(48) at 2024-11-07 09:10:14: 30,884,757,904 states generated (28,790,520 s/min), 2,240,211,537 distinct states found (1,499,092 ds/min), 98,555,003 states left on queue. 
+Progress(48) at 2024-11-07 09:11:14: 30,913,436,301 states generated (28,678,397 s/min), 2,241,549,402 distinct states found (1,337,865 ds/min), 97,972,368 states left on queue. +Progress(48) at 2024-11-07 09:12:14: 30,942,398,628 states generated (28,962,327 s/min), 2,242,894,478 distinct states found (1,345,076 ds/min), 97,450,191 states left on queue. +Progress(48) at 2024-11-07 09:13:14: 30,971,150,912 states generated (28,752,284 s/min), 2,244,149,533 distinct states found (1,255,055 ds/min), 96,915,440 states left on queue. +Progress(48) at 2024-11-07 09:14:14: 31,000,226,695 states generated (29,075,783 s/min), 2,245,486,253 distinct states found (1,336,720 ds/min), 96,453,711 states left on queue. +Progress(48) at 2024-11-07 09:15:14: 31,029,410,660 states generated (29,183,965 s/min), 2,247,033,348 distinct states found (1,547,095 ds/min), 96,134,910 states left on queue. +Progress(48) at 2024-11-07 09:16:14: 31,058,657,395 states generated (29,246,735 s/min), 2,248,447,081 distinct states found (1,413,733 ds/min), 95,701,875 states left on queue. +Progress(48) at 2024-11-07 09:17:14: 31,087,368,874 states generated (28,711,479 s/min), 2,249,703,997 distinct states found (1,256,916 ds/min), 95,112,797 states left on queue. +Progress(48) at 2024-11-07 09:18:14: 31,115,905,907 states generated (28,537,033 s/min), 2,250,949,093 distinct states found (1,245,096 ds/min), 94,499,889 states left on queue. +Progress(48) at 2024-11-07 09:19:14: 31,144,578,992 states generated (28,673,085 s/min), 2,252,226,995 distinct states found (1,277,902 ds/min), 93,927,098 states left on queue. +Progress(48) at 2024-11-07 09:20:14: 31,173,557,966 states generated (28,978,974 s/min), 2,253,602,196 distinct states found (1,375,201 ds/min), 93,561,559 states left on queue. +Progress(48) at 2024-11-07 09:21:14: 31,202,521,307 states generated (28,963,341 s/min), 2,255,224,149 distinct states found (1,621,953 ds/min), 93,337,000 states left on queue. +Progress(48) at 2024-11-07 09:22:14: 31,231,451,884 states generated (28,930,577 s/min), 2,256,879,564 distinct states found (1,655,415 ds/min), 93,119,996 states left on queue. +Progress(48) at 2024-11-07 09:23:14: 31,260,174,245 states generated (28,722,361 s/min), 2,258,206,514 distinct states found (1,326,950 ds/min), 92,610,216 states left on queue. +Progress(48) at 2024-11-07 09:24:14: 31,289,091,475 states generated (28,917,230 s/min), 2,259,564,810 distinct states found (1,358,296 ds/min), 92,123,452 states left on queue. +Progress(48) at 2024-11-07 09:25:14: 31,317,753,943 states generated (28,662,468 s/min), 2,260,868,559 distinct states found (1,303,749 ds/min), 91,550,997 states left on queue. +Progress(48) at 2024-11-07 09:26:14: 31,346,435,672 states generated (28,681,729 s/min), 2,262,197,433 distinct states found (1,328,874 ds/min), 91,002,731 states left on queue. +Progress(48) at 2024-11-07 09:27:14: 31,375,074,275 states generated (28,638,603 s/min), 2,263,549,308 distinct states found (1,351,875 ds/min), 90,479,028 states left on queue. +Progress(48) at 2024-11-07 09:28:14: 31,403,896,903 states generated (28,822,628 s/min), 2,264,999,048 distinct states found (1,449,740 ds/min), 90,030,284 states left on queue. +Progress(48) at 2024-11-07 09:29:14: 31,432,772,052 states generated (28,875,149 s/min), 2,266,431,878 distinct states found (1,432,830 ds/min), 89,580,165 states left on queue. 
+Progress(48) at 2024-11-07 09:30:14: 31,461,382,905 states generated (28,610,853 s/min), 2,267,701,315 distinct states found (1,269,437 ds/min), 89,008,135 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 09:31:15) +Progress(48) at 2024-11-07 09:31:15: 31,490,350,002 states generated (28,967,097 s/min), 2,269,120,991 distinct states found (1,419,676 ds/min), 88,574,899 states left on queue. +Progress(48) at 2024-11-07 09:32:15: 31,518,738,286 states generated (28,388,284 s/min), 2,270,333,667 distinct states found (1,212,676 ds/min), 87,950,800 states left on queue. +Progress(48) at 2024-11-07 09:33:15: 31,547,227,429 states generated (28,489,143 s/min), 2,271,632,491 distinct states found (1,298,824 ds/min), 87,379,110 states left on queue. +Progress(48) at 2024-11-07 09:34:15: 31,575,696,846 states generated (28,469,417 s/min), 2,272,873,166 distinct states found (1,240,675 ds/min), 86,717,955 states left on queue. +Progress(48) at 2024-11-07 09:35:15: 31,604,509,248 states generated (28,812,402 s/min), 2,274,166,128 distinct states found (1,292,962 ds/min), 86,122,414 states left on queue. +Progress(48) at 2024-11-07 09:36:15: 31,633,623,894 states generated (29,114,646 s/min), 2,275,690,739 distinct states found (1,524,611 ds/min), 85,718,820 states left on queue. +Progress(48) at 2024-11-07 09:37:15: 31,662,734,164 states generated (29,110,270 s/min), 2,277,282,041 distinct states found (1,591,302 ds/min), 85,389,121 states left on queue. +Progress(48) at 2024-11-07 09:38:15: 31,691,488,753 states generated (28,754,589 s/min), 2,278,666,982 distinct states found (1,384,941 ds/min), 84,903,119 states left on queue. +Progress(48) at 2024-11-07 09:39:15: 31,720,428,706 states generated (28,939,953 s/min), 2,280,231,311 distinct states found (1,564,329 ds/min), 84,529,794 states left on queue. +Progress(48) at 2024-11-07 09:40:15: 31,749,336,886 states generated (28,908,180 s/min), 2,281,688,218 distinct states found (1,456,907 ds/min), 84,091,511 states left on queue. +Progress(48) at 2024-11-07 09:41:15: 31,778,054,342 states generated (28,717,456 s/min), 2,283,102,693 distinct states found (1,414,475 ds/min), 83,605,316 states left on queue. +Progress(49) at 2024-11-07 09:42:15: 31,806,874,604 states generated (28,820,262 s/min), 2,284,525,902 distinct states found (1,423,209 ds/min), 83,115,134 states left on queue. +Progress(49) at 2024-11-07 09:43:15: 31,835,557,645 states generated (28,683,041 s/min), 2,285,776,893 distinct states found (1,250,991 ds/min), 82,491,419 states left on queue. +Progress(49) at 2024-11-07 09:44:15: 31,864,075,450 states generated (28,517,805 s/min), 2,287,028,991 distinct states found (1,252,098 ds/min), 81,847,819 states left on queue. +Progress(49) at 2024-11-07 09:45:15: 31,892,999,186 states generated (28,923,736 s/min), 2,288,552,140 distinct states found (1,523,149 ds/min), 81,459,937 states left on queue. +Progress(49) at 2024-11-07 09:46:15: 31,922,276,996 states generated (29,277,810 s/min), 2,290,137,668 distinct states found (1,585,528 ds/min), 81,115,285 states left on queue. +Progress(49) at 2024-11-07 09:47:15: 31,951,109,751 states generated (28,832,755 s/min), 2,291,477,001 distinct states found (1,339,333 ds/min), 80,582,606 states left on queue. +Progress(49) at 2024-11-07 09:48:15: 31,980,103,122 states generated (28,993,371 s/min), 2,293,149,633 distinct states found (1,672,632 ds/min), 80,321,900 states left on queue. 
+Progress(49) at 2024-11-07 09:49:15: 32,008,927,227 states generated (28,824,105 s/min), 2,294,737,299 distinct states found (1,587,666 ds/min), 79,988,982 states left on queue. +Progress(49) at 2024-11-07 09:50:15: 32,037,912,405 states generated (28,985,178 s/min), 2,296,369,269 distinct states found (1,631,970 ds/min), 79,688,340 states left on queue. +Progress(49) at 2024-11-07 09:51:15: 32,066,650,871 states generated (28,738,466 s/min), 2,297,881,682 distinct states found (1,512,413 ds/min), 79,285,058 states left on queue. +Progress(49) at 2024-11-07 09:52:15: 32,095,474,869 states generated (28,823,998 s/min), 2,299,386,856 distinct states found (1,505,174 ds/min), 78,860,285 states left on queue. +Progress(49) at 2024-11-07 09:53:15: 32,124,254,306 states generated (28,779,437 s/min), 2,300,974,245 distinct states found (1,587,389 ds/min), 78,501,509 states left on queue. +Progress(49) at 2024-11-07 09:54:15: 32,152,874,934 states generated (28,620,628 s/min), 2,302,313,494 distinct states found (1,339,249 ds/min), 77,908,264 states left on queue. +Progress(49) at 2024-11-07 09:55:15: 32,181,625,656 states generated (28,750,722 s/min), 2,303,719,911 distinct states found (1,406,417 ds/min), 77,409,147 states left on queue. +Progress(49) at 2024-11-07 09:56:15: 32,210,690,682 states generated (29,065,026 s/min), 2,305,458,559 distinct states found (1,738,648 ds/min), 77,178,015 states left on queue. +Progress(49) at 2024-11-07 09:57:15: 32,239,586,160 states generated (28,895,478 s/min), 2,307,003,156 distinct states found (1,544,597 ds/min), 76,805,818 states left on queue. +Progress(49) at 2024-11-07 09:58:15: 32,268,327,819 states generated (28,741,659 s/min), 2,308,436,891 distinct states found (1,433,735 ds/min), 76,324,212 states left on queue. +Progress(49) at 2024-11-07 09:59:15: 32,296,829,379 states generated (28,501,560 s/min), 2,309,831,948 distinct states found (1,395,057 ds/min), 75,779,735 states left on queue. +Progress(49) at 2024-11-07 10:00:15: 32,325,628,397 states generated (28,799,018 s/min), 2,311,380,882 distinct states found (1,548,934 ds/min), 75,395,162 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 10:01:15) +Progress(49) at 2024-11-07 10:01:15: 32,354,681,149 states generated (29,052,752 s/min), 2,312,867,979 distinct states found (1,487,097 ds/min), 74,928,503 states left on queue. +Progress(49) at 2024-11-07 10:02:15: 32,383,406,034 states generated (28,724,885 s/min), 2,314,202,265 distinct states found (1,334,286 ds/min), 74,352,680 states left on queue. +Progress(49) at 2024-11-07 10:03:15: 32,411,997,317 states generated (28,591,283 s/min), 2,315,435,082 distinct states found (1,232,817 ds/min), 73,700,708 states left on queue. +Progress(49) at 2024-11-07 10:04:15: 32,440,769,297 states generated (28,771,980 s/min), 2,316,687,791 distinct states found (1,252,709 ds/min), 73,114,003 states left on queue. +Progress(49) at 2024-11-07 10:05:15: 32,469,733,062 states generated (28,963,765 s/min), 2,317,885,762 distinct states found (1,197,971 ds/min), 72,558,372 states left on queue. +Progress(49) at 2024-11-07 10:06:15: 32,498,863,740 states generated (29,130,678 s/min), 2,319,353,511 distinct states found (1,467,749 ds/min), 72,186,248 states left on queue. +Progress(49) at 2024-11-07 10:07:15: 32,527,902,407 states generated (29,038,667 s/min), 2,320,635,445 distinct states found (1,281,934 ds/min), 71,639,893 states left on queue. 
+Progress(49) at 2024-11-07 10:08:15: 32,556,361,400 states generated (28,458,993 s/min), 2,321,793,726 distinct states found (1,158,281 ds/min), 70,954,333 states left on queue. +Progress(49) at 2024-11-07 10:09:15: 32,585,056,251 states generated (28,694,851 s/min), 2,323,009,155 distinct states found (1,215,429 ds/min), 70,362,671 states left on queue. +Progress(49) at 2024-11-07 10:10:15: 32,613,972,815 states generated (28,916,564 s/min), 2,324,321,084 distinct states found (1,311,929 ds/min), 69,935,186 states left on queue. +Progress(49) at 2024-11-07 10:11:15: 32,642,963,038 states generated (28,990,223 s/min), 2,325,997,874 distinct states found (1,676,790 ds/min), 69,730,871 states left on queue. +Progress(49) at 2024-11-07 10:12:15: 32,671,642,762 states generated (28,679,724 s/min), 2,327,294,217 distinct states found (1,296,343 ds/min), 69,221,413 states left on queue. +Progress(49) at 2024-11-07 10:13:15: 32,700,429,296 states generated (28,786,534 s/min), 2,328,535,742 distinct states found (1,241,525 ds/min), 68,635,066 states left on queue. +Progress(49) at 2024-11-07 10:14:15: 32,729,076,182 states generated (28,646,886 s/min), 2,329,760,071 distinct states found (1,224,329 ds/min), 67,997,735 states left on queue. +Progress(49) at 2024-11-07 10:15:15: 32,757,631,787 states generated (28,555,605 s/min), 2,331,002,517 distinct states found (1,242,446 ds/min), 67,379,374 states left on queue. +Progress(49) at 2024-11-07 10:16:15: 32,786,472,553 states generated (28,840,766 s/min), 2,332,364,440 distinct states found (1,361,923 ds/min), 66,856,953 states left on queue. +Progress(49) at 2024-11-07 10:17:15: 32,815,068,782 states generated (28,596,229 s/min), 2,333,629,799 distinct states found (1,265,359 ds/min), 66,266,973 states left on queue. +Progress(49) at 2024-11-07 10:18:15: 32,843,671,035 states generated (28,602,253 s/min), 2,334,875,787 distinct states found (1,245,988 ds/min), 65,714,901 states left on queue. +Progress(49) at 2024-11-07 10:19:15: 32,872,127,728 states generated (28,456,693 s/min), 2,336,030,334 distinct states found (1,154,547 ds/min), 65,023,805 states left on queue. +Progress(49) at 2024-11-07 10:20:15: 32,900,582,167 states generated (28,454,439 s/min), 2,337,180,611 distinct states found (1,150,277 ds/min), 64,304,348 states left on queue. +Progress(49) at 2024-11-07 10:21:15: 32,929,545,972 states generated (28,963,805 s/min), 2,338,488,833 distinct states found (1,308,222 ds/min), 63,715,470 states left on queue. +Progress(49) at 2024-11-07 10:22:15: 32,958,603,673 states generated (29,057,701 s/min), 2,339,992,330 distinct states found (1,503,497 ds/min), 63,307,968 states left on queue. +Progress(49) at 2024-11-07 10:23:15: 32,987,442,078 states generated (28,838,405 s/min), 2,341,335,966 distinct states found (1,343,636 ds/min), 62,792,292 states left on queue. +Progress(49) at 2024-11-07 10:24:15: 33,016,381,018 states generated (28,938,940 s/min), 2,342,828,482 distinct states found (1,492,516 ds/min), 62,365,394 states left on queue. +Progress(49) at 2024-11-07 10:25:15: 33,045,061,128 states generated (28,680,110 s/min), 2,344,118,515 distinct states found (1,290,033 ds/min), 61,789,542 states left on queue. +Progress(49) at 2024-11-07 10:26:15: 33,073,888,592 states generated (28,827,464 s/min), 2,345,475,829 distinct states found (1,357,314 ds/min), 61,253,128 states left on queue. 
+Progress(50) at 2024-11-07 10:27:15: 33,102,491,050 states generated (28,602,458 s/min), 2,346,652,625 distinct states found (1,176,796 ds/min), 60,570,177 states left on queue. +Progress(50) at 2024-11-07 10:28:15: 33,131,166,035 states generated (28,674,985 s/min), 2,347,941,873 distinct states found (1,289,248 ds/min), 59,969,815 states left on queue. +Progress(50) at 2024-11-07 10:29:15: 33,160,270,838 states generated (29,104,803 s/min), 2,349,441,004 distinct states found (1,499,131 ds/min), 59,570,847 states left on queue. +Progress(50) at 2024-11-07 10:30:15: 33,189,149,869 states generated (28,879,031 s/min), 2,350,812,706 distinct states found (1,371,702 ds/min), 59,068,202 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 10:31:16) +Progress(50) at 2024-11-07 10:31:16: 33,218,286,121 states generated (29,136,252 s/min), 2,352,357,375 distinct states found (1,544,669 ds/min), 58,692,343 states left on queue. +Progress(50) at 2024-11-07 10:32:16: 33,246,927,616 states generated (28,641,495 s/min), 2,353,796,993 distinct states found (1,439,618 ds/min), 58,245,674 states left on queue. +Progress(50) at 2024-11-07 10:33:16: 33,275,692,609 states generated (28,764,993 s/min), 2,355,282,278 distinct states found (1,485,285 ds/min), 57,825,713 states left on queue. +Progress(50) at 2024-11-07 10:34:16: 33,304,267,545 states generated (28,574,936 s/min), 2,356,681,270 distinct states found (1,398,992 ds/min), 57,325,849 states left on queue. +Progress(50) at 2024-11-07 10:35:16: 33,332,888,163 states generated (28,620,618 s/min), 2,358,099,683 distinct states found (1,418,413 ds/min), 56,833,993 states left on queue. +Progress(50) at 2024-11-07 10:36:16: 33,361,236,042 states generated (28,347,879 s/min), 2,359,281,358 distinct states found (1,181,675 ds/min), 56,126,890 states left on queue. +Progress(50) at 2024-11-07 10:37:16: 33,390,140,655 states generated (28,904,613 s/min), 2,360,868,517 distinct states found (1,587,159 ds/min), 55,791,859 states left on queue. +Progress(50) at 2024-11-07 10:38:16: 33,418,998,816 states generated (28,858,161 s/min), 2,362,363,780 distinct states found (1,495,263 ds/min), 55,385,255 states left on queue. +Progress(50) at 2024-11-07 10:39:16: 33,447,612,810 states generated (28,613,994 s/min), 2,363,728,858 distinct states found (1,365,078 ds/min), 54,854,942 states left on queue. +Progress(50) at 2024-11-07 10:40:16: 33,476,162,070 states generated (28,549,260 s/min), 2,365,099,267 distinct states found (1,370,409 ds/min), 54,312,039 states left on queue. +Progress(50) at 2024-11-07 10:41:16: 33,504,811,505 states generated (28,649,435 s/min), 2,366,473,549 distinct states found (1,374,282 ds/min), 53,784,809 states left on queue. +Progress(50) at 2024-11-07 10:42:16: 33,533,403,252 states generated (28,591,747 s/min), 2,367,734,253 distinct states found (1,260,704 ds/min), 53,158,819 states left on queue. +Progress(50) at 2024-11-07 10:43:16: 33,561,952,889 states generated (28,549,637 s/min), 2,368,855,124 distinct states found (1,120,871 ds/min), 52,441,471 states left on queue. +Progress(50) at 2024-11-07 10:44:16: 33,590,825,690 states generated (28,872,801 s/min), 2,370,054,403 distinct states found (1,199,279 ds/min), 51,878,202 states left on queue. +Progress(50) at 2024-11-07 10:45:16: 33,619,895,477 states generated (29,069,787 s/min), 2,371,355,035 distinct states found (1,300,632 ds/min), 51,382,836 states left on queue. 
+Progress(50) at 2024-11-07 10:46:16: 33,648,391,719 states generated (28,496,242 s/min), 2,372,441,699 distinct states found (1,086,664 ds/min), 50,647,071 states left on queue. +Progress(50) at 2024-11-07 10:47:16: 33,677,074,147 states generated (28,682,428 s/min), 2,373,600,507 distinct states found (1,158,808 ds/min), 50,052,421 states left on queue. +Progress(50) at 2024-11-07 10:48:16: 33,705,980,713 states generated (28,906,566 s/min), 2,375,050,402 distinct states found (1,449,895 ds/min), 49,692,912 states left on queue. +Progress(50) at 2024-11-07 10:49:16: 33,734,700,309 states generated (28,719,596 s/min), 2,376,355,805 distinct states found (1,305,403 ds/min), 49,202,990 states left on queue. +Progress(50) at 2024-11-07 10:50:16: 33,763,294,505 states generated (28,594,196 s/min), 2,377,489,014 distinct states found (1,133,209 ds/min), 48,526,991 states left on queue. +Progress(50) at 2024-11-07 10:51:16: 33,791,781,835 states generated (28,487,330 s/min), 2,378,610,114 distinct states found (1,121,100 ds/min), 47,806,234 states left on queue. +Progress(50) at 2024-11-07 10:52:16: 33,820,496,936 states generated (28,715,101 s/min), 2,379,861,294 distinct states found (1,251,180 ds/min), 47,194,112 states left on queue. +Progress(50) at 2024-11-07 10:53:16: 33,848,955,580 states generated (28,458,644 s/min), 2,381,018,247 distinct states found (1,156,953 ds/min), 46,544,595 states left on queue. +Progress(50) at 2024-11-07 10:54:16: 33,877,358,985 states generated (28,403,405 s/min), 2,382,084,162 distinct states found (1,065,915 ds/min), 45,797,353 states left on queue. +Progress(50) at 2024-11-07 10:55:16: 33,905,938,026 states generated (28,579,041 s/min), 2,383,237,725 distinct states found (1,153,563 ds/min), 45,079,182 states left on queue. +Progress(50) at 2024-11-07 10:56:16: 33,934,925,952 states generated (28,987,926 s/min), 2,384,648,770 distinct states found (1,411,045 ds/min), 44,602,865 states left on queue. +Progress(50) at 2024-11-07 10:57:16: 33,963,625,658 states generated (28,699,706 s/min), 2,385,892,826 distinct states found (1,244,056 ds/min), 44,000,281 states left on queue. +Progress(50) at 2024-11-07 10:58:16: 33,992,548,128 states generated (28,922,470 s/min), 2,387,290,030 distinct states found (1,397,204 ds/min), 43,514,140 states left on queue. +Progress(51) at 2024-11-07 10:59:16: 34,021,202,960 states generated (28,654,832 s/min), 2,388,511,227 distinct states found (1,221,197 ds/min), 42,867,785 states left on queue. +Progress(51) at 2024-11-07 11:00:16: 34,049,640,853 states generated (28,437,893 s/min), 2,389,565,989 distinct states found (1,054,762 ds/min), 42,084,713 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 11:01:17) +Progress(51) at 2024-11-07 11:01:17: 34,079,102,421 states generated (29,461,568 s/min), 2,391,039,395 distinct states found (1,473,406 ds/min), 41,644,463 states left on queue. +Progress(51) at 2024-11-07 11:02:17: 34,107,932,294 states generated (28,829,873 s/min), 2,392,415,920 distinct states found (1,376,525 ds/min), 41,153,999 states left on queue. +Progress(51) at 2024-11-07 11:03:17: 34,136,619,823 states generated (28,687,529 s/min), 2,393,784,341 distinct states found (1,368,421 ds/min), 40,648,398 states left on queue. +Progress(51) at 2024-11-07 11:04:17: 34,165,416,573 states generated (28,796,750 s/min), 2,395,186,568 distinct states found (1,402,227 ds/min), 40,162,223 states left on queue. 
+Progress(51) at 2024-11-07 11:05:17: 34,193,934,145 states generated (28,517,572 s/min), 2,396,461,207 distinct states found (1,274,639 ds/min), 39,558,749 states left on queue. +Progress(51) at 2024-11-07 11:06:17: 34,222,437,146 states generated (28,503,001 s/min), 2,397,667,005 distinct states found (1,205,798 ds/min), 38,877,170 states left on queue. +Progress(51) at 2024-11-07 11:07:17: 34,251,162,633 states generated (28,725,487 s/min), 2,399,047,586 distinct states found (1,380,581 ds/min), 38,366,536 states left on queue. +Progress(51) at 2024-11-07 11:08:17: 34,280,005,309 states generated (28,842,676 s/min), 2,400,476,715 distinct states found (1,429,129 ds/min), 37,912,093 states left on queue. +Progress(51) at 2024-11-07 11:09:17: 34,308,388,681 states generated (28,383,372 s/min), 2,401,648,509 distinct states found (1,171,794 ds/min), 37,215,479 states left on queue. +Progress(51) at 2024-11-07 11:10:17: 34,337,086,557 states generated (28,697,876 s/min), 2,403,035,913 distinct states found (1,387,404 ds/min), 36,712,331 states left on queue. +Progress(51) at 2024-11-07 11:11:17: 34,365,565,315 states generated (28,478,758 s/min), 2,404,187,792 distinct states found (1,151,879 ds/min), 36,008,223 states left on queue. +Progress(51) at 2024-11-07 11:12:17: 34,394,280,845 states generated (28,715,530 s/min), 2,405,264,161 distinct states found (1,076,369 ds/min), 35,318,651 states left on queue. +Progress(51) at 2024-11-07 11:13:17: 34,423,292,173 states generated (29,011,328 s/min), 2,406,461,030 distinct states found (1,196,869 ds/min), 34,731,310 states left on queue. +Progress(51) at 2024-11-07 11:14:17: 34,451,717,631 states generated (28,425,458 s/min), 2,407,470,263 distinct states found (1,009,233 ds/min), 33,977,845 states left on queue. +Progress(51) at 2024-11-07 11:15:17: 34,480,582,848 states generated (28,865,217 s/min), 2,408,844,472 distinct states found (1,374,209 ds/min), 33,563,385 states left on queue. +Progress(51) at 2024-11-07 11:16:17: 34,509,255,375 states generated (28,672,527 s/min), 2,409,992,223 distinct states found (1,147,751 ds/min), 32,948,371 states left on queue. +Progress(51) at 2024-11-07 11:17:17: 34,537,627,156 states generated (28,371,781 s/min), 2,411,007,744 distinct states found (1,015,521 ds/min), 32,138,450 states left on queue. +Progress(51) at 2024-11-07 11:18:17: 34,566,104,650 states generated (28,477,494 s/min), 2,412,094,834 distinct states found (1,087,090 ds/min), 31,405,790 states left on queue. +Progress(51) at 2024-11-07 11:19:17: 34,594,468,421 states generated (28,363,771 s/min), 2,413,136,514 distinct states found (1,041,680 ds/min), 30,631,648 states left on queue. +Progress(51) at 2024-11-07 11:20:17: 34,623,282,746 states generated (28,814,325 s/min), 2,414,376,756 distinct states found (1,240,242 ds/min), 30,011,457 states left on queue. +Progress(51) at 2024-11-07 11:21:17: 34,652,013,328 states generated (28,730,582 s/min), 2,415,631,977 distinct states found (1,255,221 ds/min), 29,420,035 states left on queue. +Progress(51) at 2024-11-07 11:22:17: 34,680,708,001 states generated (28,694,673 s/min), 2,416,841,149 distinct states found (1,209,172 ds/min), 28,780,239 states left on queue. +Progress(52) at 2024-11-07 11:23:17: 34,709,197,697 states generated (28,489,696 s/min), 2,417,931,157 distinct states found (1,090,008 ds/min), 28,033,256 states left on queue. 
+Progress(52) at 2024-11-07 11:24:17: 34,738,057,742 states generated (28,860,045 s/min), 2,419,214,866 distinct states found (1,283,709 ds/min), 27,476,210 states left on queue. +Progress(52) at 2024-11-07 11:25:17: 34,766,795,719 states generated (28,737,977 s/min), 2,420,575,203 distinct states found (1,360,337 ds/min), 26,973,510 states left on queue. +Progress(52) at 2024-11-07 11:26:17: 34,795,409,801 states generated (28,614,082 s/min), 2,421,852,170 distinct states found (1,276,967 ds/min), 26,383,152 states left on queue. +Progress(52) at 2024-11-07 11:27:17: 34,823,871,413 states generated (28,461,612 s/min), 2,423,018,118 distinct states found (1,165,948 ds/min), 25,687,358 states left on queue. +Progress(52) at 2024-11-07 11:28:17: 34,852,452,267 states generated (28,580,854 s/min), 2,424,258,491 distinct states found (1,240,373 ds/min), 25,061,677 states left on queue. +Progress(52) at 2024-11-07 11:29:17: 34,881,109,110 states generated (28,656,843 s/min), 2,425,536,450 distinct states found (1,277,959 ds/min), 24,485,682 states left on queue. +Progress(52) at 2024-11-07 11:30:17: 34,909,638,357 states generated (28,529,247 s/min), 2,426,766,241 distinct states found (1,229,791 ds/min), 23,851,800 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 11:31:18) +Progress(52) at 2024-11-07 11:31:18: 34,938,217,205 states generated (28,578,848 s/min), 2,427,804,784 distinct states found (1,038,543 ds/min), 23,061,400 states left on queue. +Progress(52) at 2024-11-07 11:32:18: 34,967,089,391 states generated (28,872,186 s/min), 2,428,907,251 distinct states found (1,102,467 ds/min), 22,421,037 states left on queue. +Progress(52) at 2024-11-07 11:33:18: 34,995,531,710 states generated (28,442,319 s/min), 2,429,963,142 distinct states found (1,055,891 ds/min), 21,740,235 states left on queue. +Progress(52) at 2024-11-07 11:34:18: 35,024,141,172 states generated (28,609,462 s/min), 2,431,122,150 distinct states found (1,159,008 ds/min), 21,149,288 states left on queue. +Progress(52) at 2024-11-07 11:35:18: 35,052,351,960 states generated (28,210,788 s/min), 2,432,077,858 distinct states found (955,708 ds/min), 20,295,072 states left on queue. +Progress(52) at 2024-11-07 11:36:18: 35,080,654,028 states generated (28,302,068 s/min), 2,433,061,991 distinct states found (984,133 ds/min), 19,478,746 states left on queue. +Progress(52) at 2024-11-07 11:37:18: 35,109,293,099 states generated (28,639,071 s/min), 2,434,258,110 distinct states found (1,196,119 ds/min), 18,850,062 states left on queue. +Progress(53) at 2024-11-07 11:38:18: 35,137,874,307 states generated (28,581,208 s/min), 2,435,408,538 distinct states found (1,150,428 ds/min), 18,171,042 states left on queue. +Progress(53) at 2024-11-07 11:39:18: 35,166,493,712 states generated (28,619,405 s/min), 2,436,567,034 distinct states found (1,158,496 ds/min), 17,510,811 states left on queue. +Progress(53) at 2024-11-07 11:40:18: 35,195,076,188 states generated (28,582,476 s/min), 2,437,810,887 distinct states found (1,243,853 ds/min), 16,916,098 states left on queue. +Progress(53) at 2024-11-07 11:41:18: 35,223,492,769 states generated (28,416,581 s/min), 2,438,939,934 distinct states found (1,129,047 ds/min), 16,200,301 states left on queue. +Progress(53) at 2024-11-07 11:42:18: 35,252,026,035 states generated (28,533,266 s/min), 2,440,130,151 distinct states found (1,190,217 ds/min), 15,545,447 states left on queue. 
+Progress(53) at 2024-11-07 11:43:18: 35,280,482,465 states generated (28,456,430 s/min), 2,441,297,027 distinct states found (1,166,876 ds/min), 14,879,990 states left on queue. +Progress(53) at 2024-11-07 11:44:18: 35,308,940,796 states generated (28,458,331 s/min), 2,442,317,453 distinct states found (1,020,426 ds/min), 14,116,803 states left on queue. +Progress(53) at 2024-11-07 11:45:18: 35,337,597,306 states generated (28,656,510 s/min), 2,443,328,791 distinct states found (1,011,338 ds/min), 13,403,307 states left on queue. +Progress(53) at 2024-11-07 11:46:18: 35,366,058,165 states generated (28,460,859 s/min), 2,444,336,498 distinct states found (1,007,707 ds/min), 12,657,418 states left on queue. +Progress(53) at 2024-11-07 11:47:18: 35,394,499,327 states generated (28,441,162 s/min), 2,445,346,072 distinct states found (1,009,574 ds/min), 11,856,670 states left on queue. +Progress(53) at 2024-11-07 11:48:18: 35,423,058,448 states generated (28,559,121 s/min), 2,446,449,527 distinct states found (1,103,455 ds/min), 11,150,850 states left on queue. +Progress(54) at 2024-11-07 11:49:18: 35,451,714,950 states generated (28,656,502 s/min), 2,447,608,246 distinct states found (1,158,719 ds/min), 10,497,489 states left on queue. +Progress(54) at 2024-11-07 11:50:18: 35,480,075,027 states generated (28,360,077 s/min), 2,448,668,413 distinct states found (1,060,167 ds/min), 9,734,924 states left on queue. +Progress(54) at 2024-11-07 11:51:18: 35,508,544,241 states generated (28,469,214 s/min), 2,449,793,995 distinct states found (1,125,582 ds/min), 9,041,108 states left on queue. +Progress(54) at 2024-11-07 11:52:18: 35,537,058,894 states generated (28,514,653 s/min), 2,450,835,560 distinct states found (1,041,565 ds/min), 8,304,357 states left on queue. +Progress(54) at 2024-11-07 11:53:18: 35,565,617,770 states generated (28,558,876 s/min), 2,451,805,307 distinct states found (969,747 ds/min), 7,554,593 states left on queue. +Progress(54) at 2024-11-07 11:54:18: 35,594,096,319 states generated (28,478,549 s/min), 2,452,829,286 distinct states found (1,023,979 ds/min), 6,777,854 states left on queue. +Progress(55) at 2024-11-07 11:55:18: 35,622,658,049 states generated (28,561,730 s/min), 2,453,911,213 distinct states found (1,081,927 ds/min), 6,063,348 states left on queue. +Progress(55) at 2024-11-07 11:56:18: 35,651,019,108 states generated (28,361,059 s/min), 2,454,944,844 distinct states found (1,033,631 ds/min), 5,290,297 states left on queue. +Progress(55) at 2024-11-07 11:57:18: 35,679,577,103 states generated (28,557,995 s/min), 2,455,941,484 distinct states found (996,640 ds/min), 4,540,257 states left on queue. +Progress(55) at 2024-11-07 11:58:18: 35,708,050,230 states generated (28,473,127 s/min), 2,456,911,566 distinct states found (970,082 ds/min), 3,737,722 states left on queue. +Progress(55) at 2024-11-07 11:59:18: 35,736,484,911 states generated (28,434,681 s/min), 2,457,942,176 distinct states found (1,030,610 ds/min), 2,980,348 states left on queue. +Progress(56) at 2024-11-07 12:00:18: 35,765,029,620 states generated (28,544,709 s/min), 2,458,911,346 distinct states found (969,170 ds/min), 2,201,353 states left on queue. +Checkpointing of run states/24-11-06-15-30-45.354 +Checkpointing completed at (2024-11-07 12:01:18) +Progress(57) at 2024-11-07 12:01:18: 35,793,733,161 states generated (28,703,541 s/min), 2,459,897,228 distinct states found (985,882 ds/min), 1,411,705 states left on queue. 
+Progress(58) at 2024-11-07 12:02:18: 35,822,110,432 states generated (28,377,271 s/min), 2,460,820,961 distinct states found (923,733 ds/min), 587,430 states left on queue. +Model checking completed. No error has been found. + Estimates of the probability that TLC did not check all reachable states + because two distinct states had the same fingerprint: + calculated (optimistic): val = 4.5 + based on the actual fingerprints: val = .25 +35840434685 states generated, 2461362509 distinct states found, 0 states left on queue. +The depth of the complete state graph search is 67. +The average outdegree of the complete state graph is 1 (minimum is 0, the maximum 8 and the 95th percentile is 2). +Finished in 20h 32min at (2024-11-07 12:03:02) diff --git a/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a5_t2_l2.cfg-2024-11-06--12-09-32.log b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a5_t2_l2.cfg-2024-11-06--12-09-32.log new file mode 100644 index 000000000000..c43d52302b3c --- /dev/null +++ b/safekeeper/spec/tlc-results/MCProposerAcceptorStatic.tla-MCProposerAcceptorStatic_p2_a5_t2_l2.cfg-2024-11-06--12-09-32.log @@ -0,0 +1,89 @@ +git revision: 864f4667d +Platform: Linux neon-dev-arm64-1 6.8.0-48-generic #48-Ubuntu SMP PREEMPT_DYNAMIC Fri Sep 27 14:35:45 UTC 2024 aarch64 aarch64 aarch64 GNU/Linux +CPU Info Linux: Neoverse-N1 +CPU Cores Linux: 80 +CPU Info Mac: +CPU Cores Mac: +Spec: MCProposerAcceptorStatic.tla +Config: models/MCProposerAcceptorStatic_p2_a5_t2_l2.cfg +---- +CONSTANTS +NULL = NULL +proposers = {p1, p2} +acceptors = {a1, a2, a3, a4, a5} +max_term = 2 +max_entries = 2 +SPECIFICATION Spec +CONSTRAINT StateConstraint +INVARIANT +TypeOk +ElectionSafety +LogIsMonotonic +LogSafety +SYMMETRY ProposerAcceptorSymmetry +CHECK_DEADLOCK FALSE +ALIAS Alias + +---- + +TLC2 Version 2.20 of Day Month 20?? (rev: f68cb71) +Running breadth-first search Model-Checking with fp 90 and seed 2164066158568118414 with 80 workers on 80 cores with 54613MB heap and 61440MB offheap memory [pid: 30788] (Linux 6.8.0-48-generic aarch64, Ubuntu 21.0.4 x86_64, OffHeapDiskFPSet, DiskStateQueue). 
+Parsing file /home/arseny/neon/safekeeper/spec/MCProposerAcceptorStatic.tla +Parsing file /tmp/tlc-13824636513165485309/TLC.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLC.tla) +Parsing file /home/arseny/neon/safekeeper/spec/ProposerAcceptorStatic.tla +Parsing file /tmp/tlc-13824636513165485309/_TLCTrace.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/_TLCTrace.tla) +Parsing file /tmp/tlc-13824636513165485309/Integers.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Integers.tla) +Parsing file /tmp/tlc-13824636513165485309/Sequences.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Sequences.tla) +Parsing file /tmp/tlc-13824636513165485309/FiniteSets.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/FiniteSets.tla) +Parsing file /tmp/tlc-13824636513165485309/Naturals.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/Naturals.tla) +Parsing file /tmp/tlc-13824636513165485309/TLCExt.tla (jar:file:/home/arseny/tla2tools.jar!/tla2sany/StandardModules/TLCExt.tla) +Semantic processing of module Naturals +Semantic processing of module Sequences +Semantic processing of module FiniteSets +Semantic processing of module TLC +Semantic processing of module Integers +Semantic processing of module ProposerAcceptorStatic +Semantic processing of module TLCExt +Semantic processing of module _TLCTrace +Semantic processing of module MCProposerAcceptorStatic +Starting... (2024-11-06 12:09:33) +Computing initial states... +Finished computing initial states: 1 distinct state generated at 2024-11-06 12:09:36. +Progress(16) at 2024-11-06 12:09:39: 405,675 states generated (405,675 s/min), 18,042 distinct states found (18,042 ds/min), 7,612 states left on queue. +Progress(23) at 2024-11-06 12:10:39: 12,449,257 states generated (12,043,582 s/min), 467,293 distinct states found (449,251 ds/min), 161,057 states left on queue. +Progress(25) at 2024-11-06 12:11:39: 24,461,332 states generated (12,012,075 s/min), 861,011 distinct states found (393,718 ds/min), 267,072 states left on queue. +Progress(26) at 2024-11-06 12:12:39: 36,440,377 states generated (11,979,045 s/min), 1,234,052 distinct states found (373,041 ds/min), 355,372 states left on queue. +Progress(26) at 2024-11-06 12:13:39: 48,327,873 states generated (11,887,496 s/min), 1,583,736 distinct states found (349,684 ds/min), 425,209 states left on queue. +Progress(27) at 2024-11-06 12:14:39: 60,246,136 states generated (11,918,263 s/min), 1,933,499 distinct states found (349,763 ds/min), 494,269 states left on queue. +Progress(28) at 2024-11-06 12:15:39: 71,977,716 states generated (11,731,580 s/min), 2,265,302 distinct states found (331,803 ds/min), 553,777 states left on queue. +Progress(28) at 2024-11-06 12:16:39: 83,644,537 states generated (11,666,821 s/min), 2,575,451 distinct states found (310,149 ds/min), 594,142 states left on queue. +Progress(29) at 2024-11-06 12:17:39: 95,287,089 states generated (11,642,552 s/min), 2,888,793 distinct states found (313,342 ds/min), 639,273 states left on queue. +Progress(29) at 2024-11-06 12:18:39: 107,000,972 states generated (11,713,883 s/min), 3,194,255 distinct states found (305,462 ds/min), 673,353 states left on queue. +Progress(29) at 2024-11-06 12:19:39: 118,305,248 states generated (11,304,276 s/min), 3,467,775 distinct states found (273,520 ds/min), 692,915 states left on queue. 
+Progress(29) at 2024-11-06 12:20:39: 129,954,327 states generated (11,649,079 s/min), 3,763,186 distinct states found (295,411 ds/min), 720,349 states left on queue. +Progress(29) at 2024-11-06 12:21:39: 141,251,359 states generated (11,297,032 s/min), 4,020,407 distinct states found (257,221 ds/min), 724,036 states left on queue. +Progress(30) at 2024-11-06 12:22:39: 152,551,873 states generated (11,300,514 s/min), 4,284,278 distinct states found (263,871 ds/min), 733,726 states left on queue. +Progress(30) at 2024-11-06 12:23:39: 164,324,788 states generated (11,772,915 s/min), 4,569,569 distinct states found (285,291 ds/min), 746,476 states left on queue. +Progress(30) at 2024-11-06 12:24:39: 175,121,317 states generated (10,796,529 s/min), 4,779,505 distinct states found (209,936 ds/min), 723,070 states left on queue. +Progress(31) at 2024-11-06 12:25:39: 186,238,236 states generated (11,116,919 s/min), 5,016,034 distinct states found (236,529 ds/min), 712,944 states left on queue. +Progress(31) at 2024-11-06 12:26:39: 197,884,578 states generated (11,646,342 s/min), 5,276,094 distinct states found (260,060 ds/min), 705,471 states left on queue. +Progress(31) at 2024-11-06 12:27:39: 208,535,096 states generated (10,650,518 s/min), 5,463,450 distinct states found (187,356 ds/min), 665,661 states left on queue. +Progress(32) at 2024-11-06 12:28:39: 219,424,829 states generated (10,889,733 s/min), 5,673,673 distinct states found (210,223 ds/min), 637,975 states left on queue. +Progress(32) at 2024-11-06 12:29:39: 230,906,372 states generated (11,481,543 s/min), 5,903,516 distinct states found (229,843 ds/min), 606,255 states left on queue. +Progress(33) at 2024-11-06 12:30:39: 241,261,887 states generated (10,355,515 s/min), 6,065,731 distinct states found (162,215 ds/min), 552,728 states left on queue. +Progress(33) at 2024-11-06 12:31:39: 252,028,921 states generated (10,767,034 s/min), 6,255,487 distinct states found (189,756 ds/min), 509,620 states left on queue. +Progress(33) at 2024-11-06 12:32:39: 262,856,171 states generated (10,827,250 s/min), 6,431,063 distinct states found (175,576 ds/min), 448,834 states left on queue. +Progress(34) at 2024-11-06 12:33:39: 273,211,882 states generated (10,355,711 s/min), 6,586,644 distinct states found (155,581 ds/min), 386,905 states left on queue. +Progress(34) at 2024-11-06 12:34:39: 283,843,415 states generated (10,631,533 s/min), 6,743,916 distinct states found (157,272 ds/min), 315,135 states left on queue. +Progress(35) at 2024-11-06 12:35:39: 293,931,115 states generated (10,087,700 s/min), 6,878,405 distinct states found (134,489 ds/min), 241,126 states left on queue. +Progress(36) at 2024-11-06 12:36:39: 303,903,441 states generated (9,972,326 s/min), 6,996,394 distinct states found (117,989 ds/min), 152,775 states left on queue. +Progress(37) at 2024-11-06 12:37:39: 313,501,886 states generated (9,598,445 s/min), 7,093,031 distinct states found (96,637 ds/min), 54,009 states left on queue. +Model checking completed. No error has been found. + Estimates of the probability that TLC did not check all reachable states + because two distinct states had the same fingerprint: + calculated (optimistic): val = 1.2E-4 + based on the actual fingerprints: val = 2.1E-6 +318172398 states generated, 7127950 distinct states found, 0 states left on queue. +The depth of the complete state graph search is 44. +The average outdegree of the complete state graph is 1 (minimum is 0, the maximum 9 and the 95th percentile is 3). 
+Finished in 28min 43s at (2024-11-06 12:38:16) From 243bca1c49ec93444050412e460caa4659969d9c Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Mon, 2 Dec 2024 18:24:48 +0100 Subject: [PATCH 08/65] Bump OTel, tracing, reqwest crates (#9970) --- Cargo.lock | 158 ++++++++++++++++++++++++++--------------------------- Cargo.toml | 16 +++--- 2 files changed, 86 insertions(+), 88 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5ce27a7d45e7..ba02e3b11d40 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -185,7 +185,7 @@ checksum = "965c2d33e53cb6b267e148a4cb0760bc01f4904c1cd4bb4002a085bb016d1490" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", "synstructure", ] @@ -197,7 +197,7 @@ checksum = "7b18050c2cd6fe86c3a76584ef5e0baf286d038cda203eb6223df2cc413565f7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -256,7 +256,7 @@ checksum = "16e62a023e7c117e27523144c5d2459f4397fcc3cab0085af8e2224f643a0193" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -267,7 +267,7 @@ checksum = "b9ccdd8f2a161be9bd5c023df56f1b2a0bd1d83872ae53b71a84a12c9bf6e842" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -969,7 +969,7 @@ dependencies = [ "regex", "rustc-hash", "shlex", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -1198,7 +1198,7 @@ dependencies = [ "heck 0.4.1", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -1615,7 +1615,7 @@ dependencies = [ "proc-macro2", "quote", "strsim", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -1626,7 +1626,7 @@ checksum = "29a358ff9f12ec09c3e61fef9b5a9902623a695a46a917b07f269bff1445611a" dependencies = [ "darling_core", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -1749,7 +1749,7 @@ dependencies = [ "dsl_auto_type", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -1769,7 +1769,7 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "209c735641a413bc68c4923a9d6ad4bcb3ca306b794edaa7eb0b3228a99ffb25" dependencies = [ - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -1792,7 +1792,7 @@ checksum = "487585f4d0c6655fe74905e2504d8ad6908e4db67f744eb140876906c2f3175d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -1815,7 +1815,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -1947,7 +1947,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -1980,7 +1980,7 @@ checksum = "3bf679796c0322556351f287a51b49e48f7c4986e727b5dd78c972d30e2e16cc" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -2234,7 +2234,7 @@ checksum = "162ee34ebcb7c64a8abebc059ce0fee27c2262618d7b60ed8faf72fef13c3650" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -2337,7 +2337,7 @@ checksum = "53010ccb100b96a67bc32c0175f0ed1426b31b655d562898e57325f81c023ac0" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -3142,7 +3142,7 @@ dependencies = [ "heck 0.5.0", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -3515,9 +3515,9 @@ checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" [[package]] name = "opentelemetry" -version = "0.24.0" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"4c365a63eec4f55b7efeceb724f1336f26a9cf3427b70e59e2cd2a5b947fba96" +checksum = "570074cc999d1a58184080966e5bd3bf3a9a4af650c3b05047c2621e7405cd17" dependencies = [ "futures-core", "futures-sink", @@ -3529,9 +3529,9 @@ dependencies = [ [[package]] name = "opentelemetry-http" -version = "0.13.0" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad31e9de44ee3538fb9d64fe3376c1362f406162434609e79aea2a41a0af78ab" +checksum = "6351496aeaa49d7c267fb480678d85d1cd30c5edb20b497c48c56f62a8c14b99" dependencies = [ "async-trait", "bytes", @@ -3542,9 +3542,9 @@ dependencies = [ [[package]] name = "opentelemetry-otlp" -version = "0.17.0" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6b925a602ffb916fb7421276b86756027b37ee708f9dce2dbdcc51739f07e727" +checksum = "29e1f9c8b032d4f635c730c0efcf731d5e2530ea13fa8bef7939ddc8420696bd" dependencies = [ "async-trait", "futures-core", @@ -3560,9 +3560,9 @@ dependencies = [ [[package]] name = "opentelemetry-proto" -version = "0.7.0" +version = "0.26.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30ee9f20bff9c984511a02f082dc8ede839e4a9bf15cc2487c8d6fea5ad850d9" +checksum = "c9d3968ce3aefdcca5c27e3c4ea4391b37547726a70893aab52d3de95d5f8b34" dependencies = [ "opentelemetry", "opentelemetry_sdk", @@ -3572,15 +3572,15 @@ dependencies = [ [[package]] name = "opentelemetry-semantic-conventions" -version = "0.16.0" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1cefe0543875379e47eb5f1e68ff83f45cc41366a92dfd0d073d513bf68e9a05" +checksum = "db945c1eaea8ac6a9677185357480d215bb6999faa9f691d0c4d4d641eab7a09" [[package]] name = "opentelemetry_sdk" -version = "0.24.1" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "692eac490ec80f24a17828d49b40b60f5aeaccdfe6a503f939713afd22bc28df" +checksum = "d2c627d9f4c9cdc1f21a29ee4bfbd6028fcb8bcf2a857b43f3abdf72c9c862f3" dependencies = [ "async-trait", "futures-channel", @@ -3954,7 +3954,7 @@ dependencies = [ "parquet", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -4056,7 +4056,7 @@ checksum = "39407670928234ebc5e6e580247dd567ad73a3578460c5990f9503df207e8f07" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -4334,7 +4334,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8d3928fb5db768cb86f891ff014f0144589297e3c6a1aba6ed7cecfdace270c7" dependencies = [ "proc-macro2", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -4348,9 +4348,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.78" +version = "1.0.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" +checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" dependencies = [ "unicode-ident", ] @@ -4424,7 +4424,7 @@ dependencies = [ "prost", "prost-types", "regex", - "syn 2.0.52", + "syn 2.0.90", "tempfile", ] @@ -4438,7 +4438,7 @@ dependencies = [ "itertools 0.12.1", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -4992,9 +4992,9 @@ dependencies = [ [[package]] name = "reqwest-middleware" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0209efb52486ad88136190094ee214759ef7507068b27992256ed6610eb71a01" +checksum = 
"d1ccd3b55e711f91a9885a2fa6fbbb2e39db1776420b062efc058c6410f7e5e3" dependencies = [ "anyhow", "async-trait", @@ -5007,13 +5007,12 @@ dependencies = [ [[package]] name = "reqwest-retry" -version = "0.5.0" +version = "0.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "40f342894422862af74c50e1e9601cf0931accc9c6981e5eb413c46603b616b5" +checksum = "29c73e4195a6bfbcb174b790d9b3407ab90646976c55de58a6515da25d851178" dependencies = [ "anyhow", "async-trait", - "chrono", "futures", "getrandom 0.2.11", "http 1.1.0", @@ -5022,6 +5021,7 @@ dependencies = [ "reqwest 0.12.4", "reqwest-middleware", "retry-policies", + "thiserror", "tokio", "tracing", "wasm-timer", @@ -5029,9 +5029,9 @@ dependencies = [ [[package]] name = "reqwest-tracing" -version = "0.5.3" +version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bfdd9bfa64c72233d8dd99ab7883efcdefe9e16d46488ecb9228b71a2e2ceb45" +checksum = "ff82cf5730a1311fb9413b0bc2b8e743e0157cd73f010ab4ec374a923873b6a2" dependencies = [ "anyhow", "async-trait", @@ -5047,12 +5047,10 @@ dependencies = [ [[package]] name = "retry-policies" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "493b4243e32d6eedd29f9a398896e35c6943a123b55eec97dcaee98310d25810" +checksum = "5875471e6cab2871bc150ecb8c727db5113c9338cc3354dc5ee3425b6aa40a1c" dependencies = [ - "anyhow", - "chrono", "rand 0.8.5", ] @@ -5176,7 +5174,7 @@ dependencies = [ "regex", "relative-path", "rustc_version", - "syn 2.0.52", + "syn 2.0.90", "unicode-ident", ] @@ -5684,7 +5682,7 @@ checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -5766,7 +5764,7 @@ dependencies = [ "darling", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -6139,7 +6137,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -6190,9 +6188,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.52" +version = "2.0.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b699d15b36d1f02c3e7c69f8ffef53de37aefae075d8488d4ba1a7788d574a07" +checksum = "919d3b74a5dd0ccd15aeb8f93e7006bd9e14c295087c9896a110f490752bcf31" dependencies = [ "proc-macro2", "quote", @@ -6222,7 +6220,7 @@ checksum = "c8af7666ab7b6390ab78131fb5b0fce11d6b7a6951602017c35fa82800708971" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -6300,27 +6298,27 @@ checksum = "78ea17a2dc368aeca6f554343ced1b1e31f76d63683fa8016e5844bd7a5144a1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] name = "thiserror" -version = "1.0.57" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e45bcbe8ed29775f228095caf2cd67af7a4ccf756ebff23a306bf3e8b47b24b" +checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52" dependencies = [ "thiserror-impl", ] [[package]] name = "thiserror-impl" -version = "1.0.57" +version = "1.0.69" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a953cb265bef375dae3de6663da4d3804eee9682ea80d8e2542529b73c531c81" +checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -6494,7 +6492,7 @@ checksum = 
"5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -6719,7 +6717,7 @@ dependencies = [ "prost-build", "prost-types", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -6756,9 +6754,9 @@ checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" [[package]] name = "tracing" -version = "0.1.40" +version = "0.1.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" +checksum = "784e0ac535deb450455cbfa28a6f0df145ea1bb7ae51b821cf5e7927fdcfbdd0" dependencies = [ "log", "pin-project-lite", @@ -6779,20 +6777,20 @@ dependencies = [ [[package]] name = "tracing-attributes" -version = "0.1.27" +version = "0.1.28" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" +checksum = "395ae124c09f9e6918a2310af6038fba074bcf474ac352496d5910dd59a2226d" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] name = "tracing-core" -version = "0.1.32" +version = "0.1.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" +checksum = "e672c95779cf947c5311f83787af4fa8fffd12fb27e4993211a84bdfd9610f9c" dependencies = [ "once_cell", "valuable", @@ -6821,9 +6819,9 @@ dependencies = [ [[package]] name = "tracing-opentelemetry" -version = "0.25.0" +version = "0.27.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a9784ed4da7d921bc8df6963f8c80a0e4ce34ba6ba76668acadd3edbd985ff3b" +checksum = "dc58af5d3f6c5811462cabb3289aec0093f7338e367e5a33d28c0433b3c7360b" dependencies = [ "js-sys", "once_cell", @@ -6839,9 +6837,9 @@ dependencies = [ [[package]] name = "tracing-serde" -version = "0.1.3" +version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc6b213177105856957181934e4920de57730fc69bf42c37ee5bb664d406d9e1" +checksum = "704b1aeb7be0d0a84fc9828cae51dab5970fee5088f83d1dd7ee6f6246fc6ff1" dependencies = [ "serde", "tracing-core", @@ -6849,9 +6847,9 @@ dependencies = [ [[package]] name = "tracing-subscriber" -version = "0.3.18" +version = "0.3.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b" +checksum = "e8189decb5ac0fa7bc8b96b7cb9b2701d60d48805aca84a238004d665fcc4008" dependencies = [ "matchers", "once_cell", @@ -7258,7 +7256,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", "wasm-bindgen-shared", ] @@ -7292,7 +7290,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -7669,7 +7667,7 @@ dependencies = [ "smallvec", "spki 0.7.3", "subtle", - "syn 2.0.52", + "syn 2.0.90", "sync_wrapper 0.1.2", "tikv-jemalloc-sys", "time", @@ -7769,7 +7767,7 @@ checksum = "b3c129550b3e6de3fd0ba67ba5c81818f9805e58b8d7fee80a3a59d2c9fc601a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] @@ -7790,7 +7788,7 @@ checksum = "ce36e65b0d2999d2aafac989fb249189a141aee1f53c612c1f37d72631959f69" dependencies = [ "proc-macro2", "quote", - "syn 2.0.52", + "syn 2.0.90", ] [[package]] diff --git a/Cargo.toml b/Cargo.toml index 
64c384f17a4b..036dc0105783 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -127,10 +127,10 @@ notify = "6.0.0" num_cpus = "1.15" num-traits = "0.2.15" once_cell = "1.13" -opentelemetry = "0.24" -opentelemetry_sdk = "0.24" -opentelemetry-otlp = { version = "0.17", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } -opentelemetry-semantic-conventions = "0.16" +opentelemetry = "0.26" +opentelemetry_sdk = "0.26" +opentelemetry-otlp = { version = "0.26", default-features=false, features = ["http-proto", "trace", "http", "reqwest-client"] } +opentelemetry-semantic-conventions = "0.26" parking_lot = "0.12" parquet = { version = "53", default-features = false, features = ["zstd"] } parquet_derive = "53" @@ -144,9 +144,9 @@ rand = "0.8" redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] } regex = "1.10.2" reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] } -reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_24"] } -reqwest-middleware = "0.3.0" -reqwest-retry = "0.5" +reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_26"] } +reqwest-middleware = "0.4" +reqwest-retry = "0.7" routerify = "3" rpds = "0.13" rustc-hash = "1.1.0" @@ -191,7 +191,7 @@ tonic = {version = "0.12.3", features = ["tls", "tls-roots"]} tower-service = "0.3.2" tracing = "0.1" tracing-error = "0.2" -tracing-opentelemetry = "0.25" +tracing-opentelemetry = "0.27" tracing-subscriber = { version = "0.3", default-features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] } try-lock = "0.2.5" twox-hash = { version = "1.6.3", default-features = false } From 2dc238e5b3486a6f9e8d20d62731515a864d8281 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 2 Dec 2024 17:54:32 +0000 Subject: [PATCH 09/65] feat(proxy): emit JWT auth method and JWT issuer in parquet logs (#9971) Fix the HTTP AuthMethod to accommodate the JWT authorization method.
Introduces the JWT issuer as an additional field in the parquet logs --- proxy/src/auth/backend/jwt.rs | 10 +++-- proxy/src/context/mod.rs | 9 +++++ proxy/src/context/parquet.rs | 53 +++++++++++++++------------ proxy/src/serverless/backend.rs | 4 ++ proxy/src/serverless/sql_over_http.rs | 3 -- 5 files changed, 49 insertions(+), 30 deletions(-) diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index 517d4fd34bb8..a258090b1582 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -350,6 +350,13 @@ impl JwkCacheEntryLock { let header = base64::decode_config(header, base64::URL_SAFE_NO_PAD)?; let header = serde_json::from_slice::>(&header)?; + let payloadb = base64::decode_config(payload, base64::URL_SAFE_NO_PAD)?; + let payload = serde_json::from_slice::>(&payloadb)?; + + if let Some(iss) = &payload.issuer { + ctx.set_jwt_issuer(iss.as_ref().to_owned()); + } + let sig = base64::decode_config(signature, base64::URL_SAFE_NO_PAD)?; let kid = header.key_id.ok_or(JwtError::MissingKeyId)?; @@ -388,9 +395,6 @@ impl JwkCacheEntryLock { key => return Err(JwtError::UnsupportedKeyType(key.into())), }; - let payloadb = base64::decode_config(payload, base64::URL_SAFE_NO_PAD)?; - let payload = serde_json::from_slice::>(&payloadb)?; - tracing::debug!(?payload, "JWT signature valid with claims"); if let Some(aud) = expected_audience { diff --git a/proxy/src/context/mod.rs b/proxy/src/context/mod.rs index 4a063a5faa15..a9fb513d3ceb 100644 --- a/proxy/src/context/mod.rs +++ b/proxy/src/context/mod.rs @@ -57,6 +57,7 @@ struct RequestContextInner { application: Option, error_kind: Option, pub(crate) auth_method: Option, + jwt_issuer: Option, success: bool, pub(crate) cold_start_info: ColdStartInfo, pg_options: Option, @@ -79,6 +80,7 @@ pub(crate) enum AuthMethod { ScramSha256, ScramSha256Plus, Cleartext, + Jwt, } impl Clone for RequestContext { @@ -100,6 +102,7 @@ impl Clone for RequestContext { application: inner.application.clone(), error_kind: inner.error_kind, auth_method: inner.auth_method.clone(), + jwt_issuer: inner.jwt_issuer.clone(), success: inner.success, rejected: inner.rejected, cold_start_info: inner.cold_start_info, @@ -148,6 +151,7 @@ impl RequestContext { application: None, error_kind: None, auth_method: None, + jwt_issuer: None, success: false, rejected: None, cold_start_info: ColdStartInfo::Unknown, @@ -246,6 +250,11 @@ impl RequestContext { this.auth_method = Some(auth_method); } + pub(crate) fn set_jwt_issuer(&self, jwt_issuer: String) { + let mut this = self.0.try_lock().expect("should not deadlock"); + this.jwt_issuer = Some(jwt_issuer); + } + pub fn has_private_peer_addr(&self) -> bool { self.0 .try_lock() diff --git a/proxy/src/context/parquet.rs b/proxy/src/context/parquet.rs index b375eb886e09..3105d085260d 100644 --- a/proxy/src/context/parquet.rs +++ b/proxy/src/context/parquet.rs @@ -87,6 +87,8 @@ pub(crate) struct RequestData { branch: Option, pg_options: Option, auth_method: Option<&'static str>, + jwt_issuer: Option, + error: Option<&'static str>, /// Success is counted if we form a HTTP response with sql rows inside /// Or if we make it to proxy_pass @@ -138,7 +140,9 @@ impl From<&RequestContextInner> for RequestData { super::AuthMethod::ScramSha256 => "scram_sha_256", super::AuthMethod::ScramSha256Plus => "scram_sha_256_plus", super::AuthMethod::Cleartext => "cleartext", + super::AuthMethod::Jwt => "jwt", }), + jwt_issuer: value.jwt_issuer.clone(), protocol: value.protocol.as_str(), region: value.region, error: 
value.error_kind.as_ref().map(|e| e.to_metric_label()), @@ -519,6 +523,7 @@ mod tests { branch: Some(hex::encode(rng.gen::<[u8; 16]>())), pg_options: None, auth_method: None, + jwt_issuer: None, protocol: ["tcp", "ws", "http"][rng.gen_range(0..3)], region: "us-east-1", error: None, @@ -599,15 +604,15 @@ mod tests { assert_eq!( file_stats, [ - (1312632, 3, 6000), - (1312621, 3, 6000), - (1312680, 3, 6000), - (1312637, 3, 6000), - (1312773, 3, 6000), - (1312610, 3, 6000), - (1312404, 3, 6000), - (1312639, 3, 6000), - (437848, 1, 2000) + (1313105, 3, 6000), + (1313094, 3, 6000), + (1313153, 3, 6000), + (1313110, 3, 6000), + (1313246, 3, 6000), + (1313083, 3, 6000), + (1312877, 3, 6000), + (1313112, 3, 6000), + (438020, 1, 2000) ] ); @@ -639,11 +644,11 @@ mod tests { assert_eq!( file_stats, [ - (1203465, 5, 10000), - (1203189, 5, 10000), - (1203490, 5, 10000), - (1203475, 5, 10000), - (1203729, 5, 10000) + (1204324, 5, 10000), + (1204048, 5, 10000), + (1204349, 5, 10000), + (1204334, 5, 10000), + (1204588, 5, 10000) ] ); @@ -668,15 +673,15 @@ mod tests { assert_eq!( file_stats, [ - (1312632, 3, 6000), - (1312621, 3, 6000), - (1312680, 3, 6000), - (1312637, 3, 6000), - (1312773, 3, 6000), - (1312610, 3, 6000), - (1312404, 3, 6000), - (1312639, 3, 6000), - (437848, 1, 2000) + (1313105, 3, 6000), + (1313094, 3, 6000), + (1313153, 3, 6000), + (1313110, 3, 6000), + (1313246, 3, 6000), + (1313083, 3, 6000), + (1312877, 3, 6000), + (1313112, 3, 6000), + (438020, 1, 2000) ] ); @@ -713,7 +718,7 @@ mod tests { // files are smaller than the size threshold, but they took too long to fill so were flushed early assert_eq!( file_stats, - [(657696, 2, 3001), (657410, 2, 3000), (657206, 2, 2999)] + [(658014, 2, 3001), (657728, 2, 3000), (657524, 2, 2999)] ); tmpdir.close().unwrap(); diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 75909f3358d2..57846a4c2c51 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -53,6 +53,8 @@ impl PoolingBackend { user_info: &ComputeUserInfo, password: &[u8], ) -> Result { + ctx.set_auth_method(crate::context::AuthMethod::Cleartext); + let user_info = user_info.clone(); let backend = self.auth_backend.as_ref().map(|()| user_info.clone()); let (allowed_ips, maybe_secret) = backend.get_allowed_ips_and_secret(ctx).await?; @@ -115,6 +117,8 @@ impl PoolingBackend { user_info: &ComputeUserInfo, jwt: String, ) -> Result { + ctx.set_auth_method(crate::context::AuthMethod::Jwt); + match &self.auth_backend { crate::auth::Backend::ControlPlane(console, ()) => { self.config diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index afd93d02f047..a0ca7cc60d6a 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -139,9 +139,6 @@ fn get_conn_info( headers: &HeaderMap, tls: Option<&TlsConfig>, ) -> Result { - // HTTP only uses cleartext (for now and likely always) - ctx.set_auth_method(crate::context::AuthMethod::Cleartext); - let connection_string = headers .get(&CONN_STRING) .ok_or(ConnInfoError::InvalidHeader(&CONN_STRING))? From d8ebd33fe6f3cf0fb154a380e1397ff392d9437c Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Mon, 2 Dec 2024 12:06:19 -0600 Subject: [PATCH 10/65] Stop changing the value of neon.extension_server_port at runtime (#9972) On reconfigure, we no longer passed a port for the extension server which caused us to not write out the neon.extension_server_port line. 
Thus, Postgres thought we were setting the port to the default value of 0. PGC_POSTMASTER GUCs cannot be set at runtime, which causes the following log messages: > LOG: parameter "neon.extension_server_port" cannot be changed without restarting the server > LOG: configuration file "/var/db/postgres/compute/pgdata/postgresql.conf" contains errors; unaffected changes were applied Fixes: https://github.com/neondatabase/neon/issues/9945 Signed-off-by: Tristan Partin --- compute_tools/src/bin/compute_ctl.rs | 9 ++------- compute_tools/src/compute.rs | 19 +++++++------------ compute_tools/src/config.rs | 6 ++---- 3 files changed, 11 insertions(+), 23 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index b178d7abd6d6..e73ccd908e3e 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -335,6 +335,7 @@ fn wait_spec( pgdata: pgdata.to_string(), pgbin: pgbin.to_string(), pgversion: get_pg_version_string(pgbin), + http_port, live_config_allowed, state: Mutex::new(new_state), state_changed: Condvar::new(), @@ -389,7 +390,6 @@ fn wait_spec( Ok(WaitSpecResult { compute, - http_port, resize_swap_on_bind, set_disk_quota_for_fs: set_disk_quota_for_fs.cloned(), }) @@ -397,8 +397,6 @@ fn wait_spec( struct WaitSpecResult { compute: Arc, - // passed through from ProcessCliResult - http_port: u16, resize_swap_on_bind: bool, set_disk_quota_for_fs: Option, } @@ -408,7 +406,6 @@ fn start_postgres( #[allow(unused_variables)] matches: &clap::ArgMatches, WaitSpecResult { compute, - http_port, resize_swap_on_bind, set_disk_quota_for_fs, }: WaitSpecResult, @@ -481,12 +478,10 @@ fn start_postgres( } } - let extension_server_port: u16 = http_port; - // Start Postgres let mut pg = None; if !prestartup_failed { - pg = match compute.start_compute(extension_server_port) { + pg = match compute.start_compute() { Ok(pg) => Some(pg), Err(err) => { error!("could not start the compute node: {:#}", err); diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index da1caf1a9b2f..0d1e6d680fe9 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -79,6 +79,8 @@ pub struct ComputeNode { /// - we push spec and it does configuration /// - but then it is restarted without any spec again pub live_config_allowed: bool, + /// The port that the compute's HTTP server listens on + pub http_port: u16, /// Volatile part of the `ComputeNode`, which should be used under `Mutex`. /// To allow HTTP API server to serving status requests, while configuration /// is in progress, lock should be held only for short periods of time to do @@ -611,11 +613,7 @@ impl ComputeNode { /// Do all the preparations like PGDATA directory creation, configuration, /// safekeepers sync, basebackup, etc. 
#[instrument(skip_all)] - pub fn prepare_pgdata( - &self, - compute_state: &ComputeState, - extension_server_port: u16, - ) -> Result<()> { + pub fn prepare_pgdata(&self, compute_state: &ComputeState) -> Result<()> { let pspec = compute_state.pspec.as_ref().expect("spec must be set"); let spec = &pspec.spec; let pgdata_path = Path::new(&self.pgdata); @@ -625,7 +623,7 @@ impl ComputeNode { config::write_postgres_conf( &pgdata_path.join("postgresql.conf"), &pspec.spec, - Some(extension_server_port), + self.http_port, )?; // Syncing safekeepers is only safe with primary nodes: if a primary @@ -1243,7 +1241,7 @@ impl ComputeNode { // Write new config let pgdata_path = Path::new(&self.pgdata); let postgresql_conf_path = pgdata_path.join("postgresql.conf"); - config::write_postgres_conf(&postgresql_conf_path, &spec, None)?; + config::write_postgres_conf(&postgresql_conf_path, &spec, self.http_port)?; // TODO(ololobus): We need a concurrency during reconfiguration as well, // but DB is already running and used by user. We can easily get out of @@ -1284,10 +1282,7 @@ impl ComputeNode { } #[instrument(skip_all)] - pub fn start_compute( - &self, - extension_server_port: u16, - ) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> { + pub fn start_compute(&self) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> { let compute_state = self.state.lock().unwrap().clone(); let pspec = compute_state.pspec.as_ref().expect("spec must be set"); info!( @@ -1362,7 +1357,7 @@ impl ComputeNode { info!("{:?}", remote_ext_metrics); } - self.prepare_pgdata(&compute_state, extension_server_port)?; + self.prepare_pgdata(&compute_state)?; let start_time = Utc::now(); let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?; diff --git a/compute_tools/src/config.rs b/compute_tools/src/config.rs index d65fe7319401..b257c8a68f81 100644 --- a/compute_tools/src/config.rs +++ b/compute_tools/src/config.rs @@ -37,7 +37,7 @@ pub fn line_in_file(path: &Path, line: &str) -> Result { pub fn write_postgres_conf( path: &Path, spec: &ComputeSpec, - extension_server_port: Option, + extension_server_port: u16, ) -> Result<()> { // File::create() destroys the file content if it exists. let mut file = File::create(path)?; @@ -127,9 +127,7 @@ pub fn write_postgres_conf( writeln!(file, "# Managed by compute_ctl: end")?; } - if let Some(port) = extension_server_port { - writeln!(file, "neon.extension_server_port={}", port)?; - } + writeln!(file, "neon.extension_server_port={}", extension_server_port)?; // This is essential to keep this line at the end of the file, // because it is intended to override any settings above. From 2e9207fdf3161799509527b6f8d4423fea718559 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Mon, 2 Dec 2024 19:46:06 +0100 Subject: [PATCH 11/65] fix(testing): Use 1 MB shared_buffers even with LFC (#9969) ## Problem After enabling LFC in tests and lowering `shared_buffers` we started having more problems with `test_pg_regress`. ## Summary of changes Set `shared_buffers` to 1MB to both exercise getPage requests/LFC, and still have enough room for Postgres to operate. Everything smaller might be not enough for Postgres under load, and can cause errors like 'no unpinned buffers available'. See Konstantin's comment [1] as well. 
Fixes #9956 [1]: https://github.com/neondatabase/neon/issues/9956#issuecomment-2511608097 --- control_plane/src/endpoint.rs | 4 ++++ test_runner/fixtures/neon_fixtures.py | 6 ++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 71514daa7cc5..1ca6dc43c4cf 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -310,6 +310,10 @@ impl Endpoint { conf.append("wal_log_hints", "off"); conf.append("max_replication_slots", "10"); conf.append("hot_standby", "on"); + // Set to 1MB to both exercise getPage requests/LFC, and still have enough room for + // Postgres to operate. Everything smaller might be not enough for Postgres under load, + // and can cause errors like 'no unpinned buffers available', see + // conf.append("shared_buffers", "1MB"); conf.append("fsync", "off"); conf.append("max_connections", "100"); diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 5709a3b82b96..f55f06bebc00 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3801,13 +3801,11 @@ def create( assert size_to_bytes(size) >= size_to_bytes( "1MB" ), "LFC size cannot be set less than 1MB" - # shared_buffers = 512kB to make postgres use LFC intensively - # neon.max_file_cache_size and neon.file_cache size limit are - # set to 1MB because small LFC is better for testing (helps to find more problems) lfc_path_escaped = str(lfc_path).replace("'", "''") config_lines = [ - "shared_buffers = 512kB", f"neon.file_cache_path = '{lfc_path_escaped}'", + # neon.max_file_cache_size and neon.file_cache size limits are + # set to 1MB because small LFC is better for testing (helps to find more problems) "neon.max_file_cache_size = 1MB", "neon.file_cache_size_limit = 1MB", ] + config_lines From aaee713e538c6541f9a54c4aef299762d5081b16 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 3 Dec 2024 08:59:38 +0000 Subject: [PATCH 12/65] storcon: use proper schedule context during node delete (#9958) ## Problem I was touching `test_storage_controller_node_deletion` because for AZ scheduling work I was adding a change to the storage controller (kick secondaries during optimisation) that made a FIXME in this test defunct. While looking at it I also realized that we can easily fix the way node deletion currently doesn't use a proper ScheduleContext, using the iterator type recently added for that purpose. ## Summary of changes - A testing-only behavior in storage controller where if a secondary location isn't yet ready during optimisation, it will be actively polled. - Remove workaround in `test_storage_controller_node_deletion` that previously was needed because optimisation would get stuck on cold secondaries. 
- Update node deletion code to use a `TenantShardContextIterator` and thereby a proper ScheduleContext --- storage_controller/src/service.rs | 114 ++++++++++++++---- test_runner/regress/test_sharding.py | 7 ++ .../regress/test_storage_controller.py | 8 +- 3 files changed, 97 insertions(+), 32 deletions(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 631fdb49239c..52c9c4710d89 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -5158,34 +5158,38 @@ impl Service { *nodes = Arc::new(nodes_mut); } - for (tenant_shard_id, shard) in tenants { - if shard.deref_node(node_id) { - // FIXME: we need to build a ScheduleContext that reflects this shard's peers, otherwise - // it won't properly do anti-affinity. - let mut schedule_context = ScheduleContext::default(); - - if let Err(e) = shard.schedule(scheduler, &mut schedule_context) { - // TODO: implement force flag to remove a node even if we can't reschedule - // a tenant - tracing::error!("Refusing to delete node, shard {tenant_shard_id} can't be rescheduled: {e}"); - return Err(e.into()); - } else { - tracing::info!( - "Rescheduled shard {tenant_shard_id} away from node during deletion" - ) + for (_tenant_id, mut schedule_context, shards) in + TenantShardContextIterator::new(tenants, ScheduleMode::Normal) + { + for shard in shards { + if shard.deref_node(node_id) { + if let Err(e) = shard.schedule(scheduler, &mut schedule_context) { + // TODO: implement force flag to remove a node even if we can't reschedule + // a tenant + tracing::error!( + "Refusing to delete node, shard {} can't be rescheduled: {e}", + shard.tenant_shard_id + ); + return Err(e.into()); + } else { + tracing::info!( + "Rescheduled shard {} away from node during deletion", + shard.tenant_shard_id + ) + } + + self.maybe_reconcile_shard(shard, nodes); } - self.maybe_reconcile_shard(shard, nodes); + // Here we remove an existing observed location for the node we're removing, and it will + // not be re-added by a reconciler's completion because we filter out removed nodes in + // process_result. + // + // Note that we update the shard's observed state _after_ calling maybe_reconcile_shard: that + // means any reconciles we spawned will know about the node we're deleting, enabling them + // to do live migrations if it's still online. + shard.observed.locations.remove(&node_id); } - - // Here we remove an existing observed location for the node we're removing, and it will - // not be re-added by a reconciler's completion because we filter out removed nodes in - // process_result. - // - // Note that we update the shard's observed state _after_ calling maybe_reconcile_shard: that - // means any reconciles we spawned will know about the node we're deleting, enabling them - // to do live migrations if it's still online. - shard.observed.locations.remove(&node_id); } scheduler.node_remove(node_id); @@ -6279,6 +6283,14 @@ impl Service { > DOWNLOAD_FRESHNESS_THRESHOLD { tracing::info!("Skipping migration of {tenant_shard_id} to {node} because secondary isn't ready: {progress:?}"); + + #[cfg(feature = "testing")] + if progress.heatmap_mtime.is_none() { + // No heatmap might mean the attached location has never uploaded one, or that + // the secondary download hasn't happened yet. This is relatively unusual in the field, + // but fairly common in tests. 
+ self.kick_secondary_download(tenant_shard_id).await; + } } else { // Location looks ready: proceed tracing::info!( @@ -6293,6 +6305,58 @@ impl Service { validated_work } + /// Some aspects of scheduling optimisation wait for secondary locations to be warm. This + /// happens on multi-minute timescales in the field, which is fine because optimisation is meant + /// to be a lazy background thing. However, when testing, it is not practical to wait around, so + /// we have this helper to move things along faster. + #[cfg(feature = "testing")] + async fn kick_secondary_download(&self, tenant_shard_id: TenantShardId) { + let (attached_node, secondary_node) = { + let locked = self.inner.read().unwrap(); + let Some(shard) = locked.tenants.get(&tenant_shard_id) else { + return; + }; + let (Some(attached), Some(secondary)) = ( + shard.intent.get_attached(), + shard.intent.get_secondary().first(), + ) else { + return; + }; + ( + locked.nodes.get(attached).unwrap().clone(), + locked.nodes.get(secondary).unwrap().clone(), + ) + }; + + // Make remote API calls to upload + download heatmaps: we ignore errors because this is just + // a 'kick' to let scheduling optimisation run more promptly. + attached_node + .with_client_retries( + |client| async move { client.tenant_heatmap_upload(tenant_shard_id).await }, + &self.config.jwt_token, + 3, + 10, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await; + + secondary_node + .with_client_retries( + |client| async move { + client + .tenant_secondary_download(tenant_shard_id, Some(Duration::from_secs(1))) + .await + }, + &self.config.jwt_token, + 3, + 10, + SHORT_RECONCILE_TIMEOUT, + &self.cancel, + ) + .await; + } + /// Look for shards which are oversized and in need of splitting async fn autosplit_tenants(self: &Arc) { let Some(split_threshold) = self.config.split_threshold else { diff --git a/test_runner/regress/test_sharding.py b/test_runner/regress/test_sharding.py index c86ba0d4ea65..30abf91d3a6f 100644 --- a/test_runner/regress/test_sharding.py +++ b/test_runner/regress/test_sharding.py @@ -519,6 +519,13 @@ def test_sharding_split_smoke( # We will have 2 shards per pageserver once done (including secondaries) neon_env_builder.num_pageservers = split_shard_count + # Two AZs + def assign_az(ps_cfg): + az = f"az-{(ps_cfg['id'] - 1) % 2}" + ps_cfg["availability_zone"] = az + + neon_env_builder.pageserver_config_override = assign_az + # 1MiB stripes: enable getting some meaningful data distribution without # writing large quantities of data in this test. The stripe size is given # in number of 8KiB pages. diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index e93e251b4fa7..685af5caaf98 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -2253,12 +2253,7 @@ def assert_victim_evacuated(): assert victim.id not in shard["node_secondary"] # Reconciles running during deletion should all complete - # FIXME: this currently doesn't work because the deletion schedules shards without a proper ScheduleContext, resulting - # in states that background_reconcile wants to optimize, but can't proceed with migrations yet because this is a short3 - # test that hasn't uploaded any heatmaps for secondaries. - # In the interim, just do a reconcile_all to enable the consistency check. 
- # env.storage_controller.reconcile_until_idle() - env.storage_controller.reconcile_all() + env.storage_controller.reconcile_until_idle() # Controller should pass its own consistency checks env.storage_controller.consistency_check() @@ -2267,7 +2262,6 @@ def assert_victim_evacuated(): env.storage_controller.stop() env.storage_controller.start() assert victim.id not in [n["id"] for n in env.storage_controller.node_list()] - env.storage_controller.reconcile_all() # FIXME: workaround for optimizations happening on startup, see FIXME above. env.storage_controller.consistency_check() From 15d01b257ac3bf4d21347d4341fb61a147ee8ecb Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 3 Dec 2024 11:55:13 +0100 Subject: [PATCH 13/65] storcon_cli tenant-describe: include tenant-wide information in output (#9899) Before this PR, the storcon_cli didn't have a way to show the tenant-wide information of the TenantDescribeResponse. Sadly, the `Serialize` impl for the tenant config doesn't skip on `None`, so, the output becomes a bit bloated. Maybe we can use `skip_serializing_if(Option::is_none)` in the future. => https://github.com/neondatabase/neon/issues/9983 --- control_plane/storcon_cli/src/main.rs | 16 ++++++++++++++-- test_runner/regress/test_storage_controller.py | 4 ++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/control_plane/storcon_cli/src/main.rs b/control_plane/storcon_cli/src/main.rs index b7f38c628660..e87942453260 100644 --- a/control_plane/storcon_cli/src/main.rs +++ b/control_plane/storcon_cli/src/main.rs @@ -560,14 +560,26 @@ async fn main() -> anyhow::Result<()> { .await?; } Command::TenantDescribe { tenant_id } => { - let describe_response = storcon_client + let TenantDescribeResponse { + tenant_id, + shards, + stripe_size, + policy, + config, + } = storcon_client .dispatch::<(), TenantDescribeResponse>( Method::GET, format!("control/v1/tenant/{tenant_id}"), None, ) .await?; - let shards = describe_response.shards; + println!("Tenant {tenant_id}"); + let mut table = comfy_table::Table::new(); + table.add_row(["Policy", &format!("{:?}", policy)]); + table.add_row(["Stripe size", &format!("{:?}", stripe_size)]); + table.add_row(["Config", &serde_json::to_string_pretty(&config).unwrap()]); + println!("{table}"); + println!("Shards:"); let mut table = comfy_table::Table::new(); table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]); for shard in shards { diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 685af5caaf98..244893a616d4 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -1747,8 +1747,8 @@ def storcon_cli(args): # Describe a tenant tenant_lines = storcon_cli(["tenant-describe", "--tenant-id", str(env.initial_tenant)]) - assert len(tenant_lines) == 3 + shard_count * 2 - assert str(env.initial_tenant) in tenant_lines[3] + assert len(tenant_lines) >= 3 + shard_count * 2 + assert str(env.initial_tenant) in tenant_lines[0] # Pause changes on a tenant storcon_cli(["tenant-policy", "--tenant-id", str(env.initial_tenant), "--scheduling", "stop"]) From cb10be710dd4c4dd513bb3a16a77ae2800cbc888 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 3 Dec 2024 12:03:23 +0100 Subject: [PATCH 14/65] page_service: batching observability & include throttled time in smgr metrics (#9870) This PR - fixes smgr metrics https://github.com/neondatabase/neon/issues/9925 - adds an additional startup log line logging the 
current batching config - adds a histogram of batch sizes global and per-tenant - adds a metric exposing the current batching config The issue described in #9925 is that before this PR, request latency was only observed *after* batching. This means that smgr latency metrics (most importantly getpage latency) don't account for - `wait_lsn` time - time spent waiting for batch to fill up / the executor stage to pick up the batch. The fix is to use a per-request batching timer, like we did before the initial batching PR. We funnel those timers through the entire request lifecycle. I noticed that even before the initial batching changes, we weren't accounting for the time spent writing & flushing the response to the wire. This PR drive-by fixes that deficiency by dropping the timers at the very end of processing the batch, i.e., after the `pgb.flush()` call. I was unable to maintain the behavior that we deduct time-spent-in-throttle from various latency metrics. The reason is that we're using a *single* counter in `RequestContext` to track micros spent in throttle. But there are *N* metrics timers in the batch, one per request. As a consequence, the practice of consuming the counter in the drop handler of each timer no longer works because all but the first timer will encounter error `close() called on closed state`. A failed attempt to maintain the current behavior can be found in https://github.com/neondatabase/neon/pull/9951. So, this PR removes the deduction behavior from all metrics. I started a discussion on Slack about the implications this has for our internal SLO calculation: https://neondb.slack.com/archives/C033RQ5SPDH/p1732910861704029 # Refs - fixes https://github.com/neondatabase/neon/issues/9925 - sub-issue https://github.com/neondatabase/neon/issues/9377 - epic: https://github.com/neondatabase/neon/issues/9376 --- pageserver/src/bin/pageserver.rs | 3 +- pageserver/src/context.rs | 5 - pageserver/src/context/optional_counter.rs | 101 ------ pageserver/src/metrics.rs | 249 +++++++------- pageserver/src/page_service.rs | 311 +++++++++++------- pageserver/src/pgdatadir_mapping.rs | 18 +- pageserver/src/tenant/throttle.rs | 17 +- pageserver/src/tenant/timeline.rs | 7 +- test_runner/fixtures/metrics.py | 1 + .../pageserver/test_page_service_batching.py | 28 +- .../test_pageserver_getpage_throttle.py | 31 +- 11 files changed, 374 insertions(+), 397 deletions(-) delete mode 100644 pageserver/src/context/optional_counter.rs diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index a8c2c2e99278..31f437085519 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -127,6 +127,7 @@ fn main() -> anyhow::Result<()> { info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode"); info!(?conf.wal_receiver_protocol, "starting with WAL receiver protocol"); + info!(?conf.page_service_pipelining, "starting with page service pipelining config"); // The tenants directory contains all the pageserver local disk state. // Create if not exists and make sure all the contents are durable before proceeding.
@@ -302,7 +303,7 @@ fn start_pageserver( pageserver::metrics::tokio_epoll_uring::Collector::new(), )) .unwrap(); - pageserver::preinitialize_metrics(); + pageserver::preinitialize_metrics(conf); // If any failpoints were set from FAILPOINTS environment variable, // print them to the log for debugging purposes diff --git a/pageserver/src/context.rs b/pageserver/src/context.rs index 7afcf52cf29e..8f2177fe5b22 100644 --- a/pageserver/src/context.rs +++ b/pageserver/src/context.rs @@ -91,8 +91,6 @@ use crate::task_mgr::TaskKind; -pub(crate) mod optional_counter; - // The main structure of this module, see module-level comment. #[derive(Debug)] pub struct RequestContext { @@ -100,7 +98,6 @@ pub struct RequestContext { download_behavior: DownloadBehavior, access_stats_behavior: AccessStatsBehavior, page_content_kind: PageContentKind, - pub micros_spent_throttled: optional_counter::MicroSecondsCounterU32, } /// The kind of access to the page cache. @@ -158,7 +155,6 @@ impl RequestContextBuilder { download_behavior: DownloadBehavior::Download, access_stats_behavior: AccessStatsBehavior::Update, page_content_kind: PageContentKind::Unknown, - micros_spent_throttled: Default::default(), }, } } @@ -172,7 +168,6 @@ impl RequestContextBuilder { download_behavior: original.download_behavior, access_stats_behavior: original.access_stats_behavior, page_content_kind: original.page_content_kind, - micros_spent_throttled: Default::default(), }, } } diff --git a/pageserver/src/context/optional_counter.rs b/pageserver/src/context/optional_counter.rs deleted file mode 100644 index 100c649f18cb..000000000000 --- a/pageserver/src/context/optional_counter.rs +++ /dev/null @@ -1,101 +0,0 @@ -use std::{ - sync::atomic::{AtomicU32, Ordering}, - time::Duration, -}; - -#[derive(Debug)] -pub struct CounterU32 { - inner: AtomicU32, -} -impl Default for CounterU32 { - fn default() -> Self { - Self { - inner: AtomicU32::new(u32::MAX), - } - } -} -impl CounterU32 { - pub fn open(&self) -> Result<(), &'static str> { - match self - .inner - .compare_exchange(u32::MAX, 0, Ordering::Relaxed, Ordering::Relaxed) - { - Ok(_) => Ok(()), - Err(_) => Err("open() called on clsoed state"), - } - } - pub fn close(&self) -> Result { - match self.inner.swap(u32::MAX, Ordering::Relaxed) { - u32::MAX => Err("close() called on closed state"), - x => Ok(x), - } - } - - pub fn add(&self, count: u32) -> Result<(), &'static str> { - if count == 0 { - return Ok(()); - } - let mut had_err = None; - self.inner - .fetch_update(Ordering::Relaxed, Ordering::Relaxed, |cur| match cur { - u32::MAX => { - had_err = Some("add() called on closed state"); - None - } - x => { - let (new, overflowed) = x.overflowing_add(count); - if new == u32::MAX || overflowed { - had_err = Some("add() overflowed the counter"); - None - } else { - Some(new) - } - } - }) - .map_err(|_| had_err.expect("we set it whenever the function returns None")) - .map(|_| ()) - } -} - -#[derive(Default, Debug)] -pub struct MicroSecondsCounterU32 { - inner: CounterU32, -} - -impl MicroSecondsCounterU32 { - pub fn open(&self) -> Result<(), &'static str> { - self.inner.open() - } - pub fn add(&self, duration: Duration) -> Result<(), &'static str> { - match duration.as_micros().try_into() { - Ok(x) => self.inner.add(x), - Err(_) => Err("add(): duration conversion error"), - } - } - pub fn close_and_checked_sub_from(&self, from: Duration) -> Result { - let val = self.inner.close()?; - let val = Duration::from_micros(val as u64); - let subbed = match from.checked_sub(val) { - Some(v) => v, - 
None => return Err("Duration::checked_sub"), - }; - Ok(subbed) - } -} - -#[cfg(test)] -mod tests { - - use super::*; - - #[test] - fn test_basic() { - let counter = MicroSecondsCounterU32::default(); - counter.open().unwrap(); - counter.add(Duration::from_micros(23)).unwrap(); - let res = counter - .close_and_checked_sub_from(Duration::from_micros(42)) - .unwrap(); - assert_eq!(res, Duration::from_micros(42 - 23)); - } -} diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 86be97587fef..d04fae762770 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -7,6 +7,10 @@ use metrics::{ IntCounterPairVec, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec, }; use once_cell::sync::Lazy; +use pageserver_api::config::{ + PageServicePipeliningConfig, PageServicePipeliningConfigPipelined, + PageServiceProtocolPipelinedExecutionStrategy, +}; use pageserver_api::shard::TenantShardId; use postgres_backend::{is_expected_io_error, QueryError}; use pq_proto::framed::ConnectionError; @@ -1216,50 +1220,21 @@ pub(crate) mod virtual_file_io_engine { }); } -struct GlobalAndPerTimelineHistogramTimer<'a, 'c> { - global_latency_histo: &'a Histogram, +pub(crate) struct SmgrOpTimer { + global_latency_histo: Histogram, // Optional because not all op types are tracked per-timeline - per_timeline_latency_histo: Option<&'a Histogram>, + per_timeline_latency_histo: Option, - ctx: &'c RequestContext, - start: std::time::Instant, - op: SmgrQueryType, - count: usize, + start: Instant, } -impl Drop for GlobalAndPerTimelineHistogramTimer<'_, '_> { +impl Drop for SmgrOpTimer { fn drop(&mut self) { - let elapsed = self.start.elapsed(); - let ex_throttled = self - .ctx - .micros_spent_throttled - .close_and_checked_sub_from(elapsed); - let ex_throttled = match ex_throttled { - Ok(res) => res, - Err(error) => { - use utils::rate_limit::RateLimit; - static LOGGED: Lazy>> = - Lazy::new(|| { - Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| { - RateLimit::new(Duration::from_secs(10)) - }))) - }); - let mut guard = LOGGED.lock().unwrap(); - let rate_limit = &mut guard[self.op]; - rate_limit.call(|| { - warn!(op=?self.op, error, "error deducting time spent throttled; this message is logged at a global rate limit"); - }); - elapsed - } - }; - - for _ in 0..self.count { - self.global_latency_histo - .observe(ex_throttled.as_secs_f64()); - if let Some(per_timeline_getpage_histo) = self.per_timeline_latency_histo { - per_timeline_getpage_histo.observe(ex_throttled.as_secs_f64()); - } + let elapsed = self.start.elapsed().as_secs_f64(); + self.global_latency_histo.observe(elapsed); + if let Some(per_timeline_getpage_histo) = &self.per_timeline_latency_histo { + per_timeline_getpage_histo.observe(elapsed); } } } @@ -1289,6 +1264,8 @@ pub(crate) struct SmgrQueryTimePerTimeline { global_latency: [Histogram; SmgrQueryType::COUNT], per_timeline_getpage_started: IntCounter, per_timeline_getpage_latency: Histogram, + global_batch_size: Histogram, + per_timeline_batch_size: Histogram, } static SMGR_QUERY_STARTED_GLOBAL: Lazy = Lazy::new(|| { @@ -1381,6 +1358,76 @@ static SMGR_QUERY_TIME_GLOBAL: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static PAGE_SERVICE_BATCH_SIZE_BUCKETS_GLOBAL: Lazy> = Lazy::new(|| { + (1..=u32::try_from(Timeline::MAX_GET_VECTORED_KEYS).unwrap()) + .map(|v| v.into()) + .collect() +}); + +static PAGE_SERVICE_BATCH_SIZE_GLOBAL: Lazy = Lazy::new(|| { + register_histogram!( + "pageserver_page_service_batch_size_global", + "Batch size of 
pageserver page service requests", + PAGE_SERVICE_BATCH_SIZE_BUCKETS_GLOBAL.clone(), + ) + .expect("failed to define a metric") +}); + +static PAGE_SERVICE_BATCH_SIZE_BUCKETS_PER_TIMELINE: Lazy> = Lazy::new(|| { + let mut buckets = Vec::new(); + for i in 0.. { + let bucket = 1 << i; + if bucket > u32::try_from(Timeline::MAX_GET_VECTORED_KEYS).unwrap() { + break; + } + buckets.push(bucket.into()); + } + buckets +}); + +static PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE: Lazy = Lazy::new(|| { + register_histogram_vec!( + "pageserver_page_service_batch_size", + "Batch size of pageserver page service requests", + &["tenant_id", "shard_id", "timeline_id"], + PAGE_SERVICE_BATCH_SIZE_BUCKETS_PER_TIMELINE.clone() + ) + .expect("failed to define a metric") +}); + +pub(crate) static PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "pageserver_page_service_config_max_batch_size", + "Configured maximum batch size for the server-side batching functionality of page_service. \ + Labels expose more of the configuration parameters.", + &["mode", "execution"] + ) + .expect("failed to define a metric") +}); + +fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) { + PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE.reset(); + let (label_values, value) = match conf { + PageServicePipeliningConfig::Serial => (["serial", "-"], 1), + PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined { + max_batch_size, + execution, + }) => { + let mode = "pipelined"; + let execution = match execution { + PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures => { + "concurrent-futures" + } + PageServiceProtocolPipelinedExecutionStrategy::Tasks => "tasks", + }; + ([mode, execution], max_batch_size.get()) + } + }; + PAGE_SERVICE_CONFIG_MAX_BATCH_SIZE + .with_label_values(&label_values) + .set(value.try_into().unwrap()); +} + impl SmgrQueryTimePerTimeline { pub(crate) fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self { let tenant_id = tenant_shard_id.tenant_id.to_string(); @@ -1416,78 +1463,51 @@ impl SmgrQueryTimePerTimeline { ]) .unwrap(); + let global_batch_size = PAGE_SERVICE_BATCH_SIZE_GLOBAL.clone(); + let per_timeline_batch_size = PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE + .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id]) + .unwrap(); + Self { global_started, global_latency, per_timeline_getpage_latency, per_timeline_getpage_started, + global_batch_size, + per_timeline_batch_size, } } - pub(crate) fn start_timer<'c: 'a, 'a>( - &'a self, - op: SmgrQueryType, - ctx: &'c RequestContext, - ) -> Option { - self.start_timer_many(op, 1, ctx) - } - pub(crate) fn start_timer_many<'c: 'a, 'a>( - &'a self, - op: SmgrQueryType, - count: usize, - ctx: &'c RequestContext, - ) -> Option { - let start = Instant::now(); - + pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, started_at: Instant) -> SmgrOpTimer { self.global_started[op as usize].inc(); - // We subtract time spent throttled from the observed latency. 
- match ctx.micros_spent_throttled.open() { - Ok(()) => (), - Err(error) => { - use utils::rate_limit::RateLimit; - static LOGGED: Lazy>> = - Lazy::new(|| { - Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| { - RateLimit::new(Duration::from_secs(10)) - }))) - }); - let mut guard = LOGGED.lock().unwrap(); - let rate_limit = &mut guard[op]; - rate_limit.call(|| { - warn!(?op, error, "error opening micros_spent_throttled; this message is logged at a global rate limit"); - }); - } - } - let per_timeline_latency_histo = if matches!(op, SmgrQueryType::GetPageAtLsn) { self.per_timeline_getpage_started.inc(); - Some(&self.per_timeline_getpage_latency) + Some(self.per_timeline_getpage_latency.clone()) } else { None }; - Some(GlobalAndPerTimelineHistogramTimer { - global_latency_histo: &self.global_latency[op as usize], + SmgrOpTimer { + global_latency_histo: self.global_latency[op as usize].clone(), per_timeline_latency_histo, - ctx, - start, - op, - count, - }) + start: started_at, + } + } + + pub(crate) fn observe_getpage_batch_start(&self, batch_size: usize) { + self.global_batch_size.observe(batch_size as f64); + self.per_timeline_batch_size.observe(batch_size as f64); } } #[cfg(test)] mod smgr_query_time_tests { + use std::time::Instant; + use pageserver_api::shard::TenantShardId; use strum::IntoEnumIterator; use utils::id::{TenantId, TimelineId}; - use crate::{ - context::{DownloadBehavior, RequestContext}, - task_mgr::TaskKind, - }; - // Regression test, we used hard-coded string constants before using an enum. #[test] fn op_label_name() { @@ -1531,8 +1551,7 @@ mod smgr_query_time_tests { let (pre_global, pre_per_tenant_timeline) = get_counts(); assert_eq!(pre_per_tenant_timeline, 0); - let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Download); - let timer = metrics.start_timer(*op, &ctx); + let timer = metrics.start_smgr_op(*op, Instant::now()); drop(timer); let (post_global, post_per_tenant_timeline) = get_counts(); @@ -1579,58 +1598,24 @@ pub(crate) static BASEBACKUP_QUERY_TIME: Lazy = Lazy::new(| } }); -pub(crate) struct BasebackupQueryTimeOngoingRecording<'a, 'c> { +pub(crate) struct BasebackupQueryTimeOngoingRecording<'a> { parent: &'a BasebackupQueryTime, - ctx: &'c RequestContext, start: std::time::Instant, } impl BasebackupQueryTime { - pub(crate) fn start_recording<'c: 'a, 'a>( - &'a self, - ctx: &'c RequestContext, - ) -> BasebackupQueryTimeOngoingRecording<'a, 'a> { + pub(crate) fn start_recording(&self) -> BasebackupQueryTimeOngoingRecording<'_> { let start = Instant::now(); - match ctx.micros_spent_throttled.open() { - Ok(()) => (), - Err(error) => { - use utils::rate_limit::RateLimit; - static LOGGED: Lazy> = - Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); - let mut rate_limit = LOGGED.lock().unwrap(); - rate_limit.call(|| { - warn!(error, "error opening micros_spent_throttled; this message is logged at a global rate limit"); - }); - } - } BasebackupQueryTimeOngoingRecording { parent: self, - ctx, start, } } } -impl BasebackupQueryTimeOngoingRecording<'_, '_> { +impl BasebackupQueryTimeOngoingRecording<'_> { pub(crate) fn observe(self, res: &Result) { - let elapsed = self.start.elapsed(); - let ex_throttled = self - .ctx - .micros_spent_throttled - .close_and_checked_sub_from(elapsed); - let ex_throttled = match ex_throttled { - Ok(ex_throttled) => ex_throttled, - Err(error) => { - use utils::rate_limit::RateLimit; - static LOGGED: Lazy> = - Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); - let mut 
rate_limit = LOGGED.lock().unwrap(); - rate_limit.call(|| { - warn!(error, "error deducting time spent throttled; this message is logged at a global rate limit"); - }); - elapsed - } - }; + let elapsed = self.start.elapsed().as_secs_f64(); // If you want to change categorize of a specific error, also change it in `log_query_error`. let metric = match res { Ok(_) => &self.parent.ok, @@ -1641,7 +1626,7 @@ impl BasebackupQueryTimeOngoingRecording<'_, '_> { } Err(_) => &self.parent.error, }; - metric.observe(ex_throttled.as_secs_f64()); + metric.observe(elapsed); } } @@ -2722,6 +2707,11 @@ impl TimelineMetrics { shard_id, timeline_id, ]); + let _ = PAGE_SERVICE_BATCH_SIZE_PER_TENANT_TIMELINE.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + ]); } } @@ -2747,10 +2737,12 @@ use std::sync::{Arc, Mutex}; use std::task::{Context, Poll}; use std::time::{Duration, Instant}; +use crate::config::PageServerConf; use crate::context::{PageContentKind, RequestContext}; use crate::task_mgr::TaskKind; use crate::tenant::mgr::TenantSlot; use crate::tenant::tasks::BackgroundLoopKind; +use crate::tenant::Timeline; /// Maintain a per timeline gauge in addition to the global gauge. pub(crate) struct PerTimelineRemotePhysicalSizeGauge { @@ -3562,7 +3554,9 @@ pub(crate) fn set_tokio_runtime_setup(setup: &str, num_threads: NonZeroUsize) { .set(u64::try_from(num_threads.get()).unwrap()); } -pub fn preinitialize_metrics() { +pub fn preinitialize_metrics(conf: &'static PageServerConf) { + set_page_service_config_max_batch_size(&conf.page_service_pipelining); + // Python tests need these and on some we do alerting. // // FIXME(4813): make it so that we have no top level metrics as this fn will easily fall out of @@ -3630,6 +3624,7 @@ pub fn preinitialize_metrics() { &WAL_REDO_RECORDS_HISTOGRAM, &WAL_REDO_BYTES_HISTOGRAM, &WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, + &PAGE_SERVICE_BATCH_SIZE_GLOBAL, ] .into_iter() .for_each(|h| { diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 1917e7f5b7ea..64842aa5b886 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -51,7 +51,7 @@ use crate::auth::check_permission; use crate::basebackup::BasebackupError; use crate::config::PageServerConf; use crate::context::{DownloadBehavior, RequestContext}; -use crate::metrics::{self}; +use crate::metrics::{self, SmgrOpTimer}; use crate::metrics::{ComputeCommandKind, COMPUTE_COMMANDS_COUNTERS, LIVE_CONNECTIONS}; use crate::pgdatadir_mapping::Version; use crate::span::debug_assert_current_span_has_tenant_and_timeline_id; @@ -540,11 +540,13 @@ impl From for QueryError { enum BatchedFeMessage { Exists { span: Span, + timer: SmgrOpTimer, shard: timeline::handle::Handle, req: models::PagestreamExistsRequest, }, Nblocks { span: Span, + timer: SmgrOpTimer, shard: timeline::handle::Handle, req: models::PagestreamNblocksRequest, }, @@ -552,15 +554,17 @@ enum BatchedFeMessage { span: Span, shard: timeline::handle::Handle, effective_request_lsn: Lsn, - pages: smallvec::SmallVec<[(RelTag, BlockNumber); 1]>, + pages: smallvec::SmallVec<[(RelTag, BlockNumber, SmgrOpTimer); 1]>, }, DbSize { span: Span, + timer: SmgrOpTimer, shard: timeline::handle::Handle, req: models::PagestreamDbSizeRequest, }, GetSlruSegment { span: Span, + timer: SmgrOpTimer, shard: timeline::handle::Handle, req: models::PagestreamGetSlruSegmentRequest, }, @@ -632,6 +636,8 @@ impl PageServerHandler { msg = pgb.read_message() => { msg } }; + let received_at = Instant::now(); + let copy_data_bytes = match msg? 
{ Some(FeMessage::CopyData(bytes)) => bytes, Some(FeMessage::Terminate) => { @@ -660,7 +666,15 @@ impl PageServerHandler { .get(tenant_id, timeline_id, ShardSelector::Zero) .instrument(span.clone()) // sets `shard_id` field .await?; - BatchedFeMessage::Exists { span, shard, req } + let timer = shard + .query_metrics + .start_smgr_op(metrics::SmgrQueryType::GetRelExists, received_at); + BatchedFeMessage::Exists { + span, + timer, + shard, + req, + } } PagestreamFeMessage::Nblocks(req) => { let span = tracing::info_span!(parent: parent_span, "handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.request_lsn); @@ -668,7 +682,15 @@ impl PageServerHandler { .get(tenant_id, timeline_id, ShardSelector::Zero) .instrument(span.clone()) // sets `shard_id` field .await?; - BatchedFeMessage::Nblocks { span, shard, req } + let timer = shard + .query_metrics + .start_smgr_op(metrics::SmgrQueryType::GetRelSize, received_at); + BatchedFeMessage::Nblocks { + span, + timer, + shard, + req, + } } PagestreamFeMessage::DbSize(req) => { let span = tracing::info_span!(parent: parent_span, "handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.request_lsn); @@ -676,7 +698,15 @@ impl PageServerHandler { .get(tenant_id, timeline_id, ShardSelector::Zero) .instrument(span.clone()) // sets `shard_id` field .await?; - BatchedFeMessage::DbSize { span, shard, req } + let timer = shard + .query_metrics + .start_smgr_op(metrics::SmgrQueryType::GetDbSize, received_at); + BatchedFeMessage::DbSize { + span, + timer, + shard, + req, + } } PagestreamFeMessage::GetSlruSegment(req) => { let span = tracing::info_span!(parent: parent_span, "handle_get_slru_segment_request", kind = %req.kind, segno = %req.segno, req_lsn = %req.request_lsn); @@ -684,7 +714,15 @@ impl PageServerHandler { .get(tenant_id, timeline_id, ShardSelector::Zero) .instrument(span.clone()) // sets `shard_id` field .await?; - BatchedFeMessage::GetSlruSegment { span, shard, req } + let timer = shard + .query_metrics + .start_smgr_op(metrics::SmgrQueryType::GetSlruSegment, received_at); + BatchedFeMessage::GetSlruSegment { + span, + timer, + shard, + req, + } } PagestreamFeMessage::GetPage(PagestreamGetPageRequest { request_lsn, @@ -728,6 +766,14 @@ impl PageServerHandler { return respond_error!(e.into()); } }; + + // It's important to start the timer before waiting for the LSN + // so that the _started counters are incremented before we do + // any serious waiting, e.g., for LSNs. 
+ let timer = shard + .query_metrics + .start_smgr_op(metrics::SmgrQueryType::GetPageAtLsn, received_at); + let effective_request_lsn = match Self::wait_or_get_last_lsn( &shard, request_lsn, @@ -747,7 +793,7 @@ impl PageServerHandler { span, shard, effective_request_lsn, - pages: smallvec::smallvec![(rel, blkno)], + pages: smallvec::smallvec![(rel, blkno, timer)], } } }; @@ -832,88 +878,112 @@ impl PageServerHandler { IO: AsyncRead + AsyncWrite + Send + Sync + Unpin, { // invoke handler function - let (handler_results, span): (Vec>, _) = - match batch { - BatchedFeMessage::Exists { span, shard, req } => { - fail::fail_point!("ps::handle-pagerequest-message::exists"); - ( - vec![ - self.handle_get_rel_exists_request(&shard, &req, ctx) - .instrument(span.clone()) - .await, - ], - span, - ) - } - BatchedFeMessage::Nblocks { span, shard, req } => { - fail::fail_point!("ps::handle-pagerequest-message::nblocks"); - ( - vec![ - self.handle_get_nblocks_request(&shard, &req, ctx) - .instrument(span.clone()) - .await, - ], - span, - ) - } - BatchedFeMessage::GetPage { + let (handler_results, span): ( + Vec>, + _, + ) = match batch { + BatchedFeMessage::Exists { + span, + timer, + shard, + req, + } => { + fail::fail_point!("ps::handle-pagerequest-message::exists"); + ( + vec![self + .handle_get_rel_exists_request(&shard, &req, ctx) + .instrument(span.clone()) + .await + .map(|msg| (msg, timer))], span, - shard, - effective_request_lsn, - pages, - } => { - fail::fail_point!("ps::handle-pagerequest-message::getpage"); - ( - { - let npages = pages.len(); - trace!(npages, "handling getpage request"); - let res = self - .handle_get_page_at_lsn_request_batched( - &shard, - effective_request_lsn, - pages, - ctx, - ) - .instrument(span.clone()) - .await; - assert_eq!(res.len(), npages); - res - }, - span, - ) - } - BatchedFeMessage::DbSize { span, shard, req } => { - fail::fail_point!("ps::handle-pagerequest-message::dbsize"); - ( - vec![ - self.handle_db_size_request(&shard, &req, ctx) - .instrument(span.clone()) - .await, - ], - span, - ) - } - BatchedFeMessage::GetSlruSegment { span, shard, req } => { - fail::fail_point!("ps::handle-pagerequest-message::slrusegment"); - ( - vec![ - self.handle_get_slru_segment_request(&shard, &req, ctx) - .instrument(span.clone()) - .await, - ], - span, - ) - } - BatchedFeMessage::RespondError { span, error } => { - // We've already decided to respond with an error, so we don't need to - // call the handler. 
- (vec![Err(error)], span) - } - }; + ) + } + BatchedFeMessage::Nblocks { + span, + timer, + shard, + req, + } => { + fail::fail_point!("ps::handle-pagerequest-message::nblocks"); + ( + vec![self + .handle_get_nblocks_request(&shard, &req, ctx) + .instrument(span.clone()) + .await + .map(|msg| (msg, timer))], + span, + ) + } + BatchedFeMessage::GetPage { + span, + shard, + effective_request_lsn, + pages, + } => { + fail::fail_point!("ps::handle-pagerequest-message::getpage"); + ( + { + let npages = pages.len(); + trace!(npages, "handling getpage request"); + let res = self + .handle_get_page_at_lsn_request_batched( + &shard, + effective_request_lsn, + pages, + ctx, + ) + .instrument(span.clone()) + .await; + assert_eq!(res.len(), npages); + res + }, + span, + ) + } + BatchedFeMessage::DbSize { + span, + timer, + shard, + req, + } => { + fail::fail_point!("ps::handle-pagerequest-message::dbsize"); + ( + vec![self + .handle_db_size_request(&shard, &req, ctx) + .instrument(span.clone()) + .await + .map(|msg| (msg, timer))], + span, + ) + } + BatchedFeMessage::GetSlruSegment { + span, + timer, + shard, + req, + } => { + fail::fail_point!("ps::handle-pagerequest-message::slrusegment"); + ( + vec![self + .handle_get_slru_segment_request(&shard, &req, ctx) + .instrument(span.clone()) + .await + .map(|msg| (msg, timer))], + span, + ) + } + BatchedFeMessage::RespondError { span, error } => { + // We've already decided to respond with an error, so we don't need to + // call the handler. + (vec![Err(error)], span) + } + }; // Map handler result to protocol behavior. // Some handler errors cause exit from pagestream protocol. // Other handler errors are sent back as an error message and we stay in pagestream protocol. + let mut timers: smallvec::SmallVec<[_; 1]> = + smallvec::SmallVec::with_capacity(handler_results.len()); for handler_result in handler_results { let response_msg = match handler_result { Err(e) => match &e { @@ -944,7 +1014,12 @@ impl PageServerHandler { }) } }, - Ok(response_msg) => response_msg, + Ok((response_msg, timer)) => { + // Extending the lifetime of the timers so observations on drop + // include the flush time. 
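+                    // (The timers are dropped via `drop(timers)` below, only after the responses have been flushed.)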
+ timers.push(timer); + response_msg + } }; // marshal & transmit response message @@ -961,6 +1036,7 @@ impl PageServerHandler { res?; } } + drop(timers); Ok(()) } @@ -1423,10 +1499,6 @@ impl PageServerHandler { req: &PagestreamExistsRequest, ctx: &RequestContext, ) -> Result { - let _timer = timeline - .query_metrics - .start_timer(metrics::SmgrQueryType::GetRelExists, ctx); - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( timeline, @@ -1453,10 +1525,6 @@ impl PageServerHandler { req: &PagestreamNblocksRequest, ctx: &RequestContext, ) -> Result { - let _timer = timeline - .query_metrics - .start_timer(metrics::SmgrQueryType::GetRelSize, ctx); - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( timeline, @@ -1483,10 +1551,6 @@ impl PageServerHandler { req: &PagestreamDbSizeRequest, ctx: &RequestContext, ) -> Result { - let _timer = timeline - .query_metrics - .start_timer(metrics::SmgrQueryType::GetDbSize, ctx); - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( timeline, @@ -1512,26 +1576,41 @@ impl PageServerHandler { &mut self, timeline: &Timeline, effective_lsn: Lsn, - pages: smallvec::SmallVec<[(RelTag, BlockNumber); 1]>, + requests: smallvec::SmallVec<[(RelTag, BlockNumber, SmgrOpTimer); 1]>, ctx: &RequestContext, - ) -> Vec> { + ) -> Vec> { debug_assert_current_span_has_tenant_and_timeline_id(); - let _timer = timeline.query_metrics.start_timer_many( - metrics::SmgrQueryType::GetPageAtLsn, - pages.len(), - ctx, - ); - let pages = timeline - .get_rel_page_at_lsn_batched(pages, effective_lsn, ctx) - .await; + timeline + .query_metrics + .observe_getpage_batch_start(requests.len()); - Vec::from_iter(pages.into_iter().map(|page| { - page.map(|page| { - PagestreamBeMessage::GetPage(models::PagestreamGetPageResponse { page }) - }) - .map_err(PageStreamError::from) - })) + let results = timeline + .get_rel_page_at_lsn_batched( + requests.iter().map(|(reltag, blkno, _)| (reltag, blkno)), + effective_lsn, + ctx, + ) + .await; + assert_eq!(results.len(), requests.len()); + + // TODO: avoid creating the new Vec here + Vec::from_iter( + requests + .into_iter() + .zip(results.into_iter()) + .map(|((_, _, timer), res)| { + res.map(|page| { + ( + PagestreamBeMessage::GetPage(models::PagestreamGetPageResponse { + page, + }), + timer, + ) + }) + .map_err(PageStreamError::from) + }), + ) } #[instrument(skip_all, fields(shard_id))] @@ -1541,10 +1620,6 @@ impl PageServerHandler { req: &PagestreamGetSlruSegmentRequest, ctx: &RequestContext, ) -> Result { - let _timer = timeline - .query_metrics - .start_timer(metrics::SmgrQueryType::GetSlruSegment, ctx); - let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn(); let lsn = Self::wait_or_get_last_lsn( timeline, @@ -2045,7 +2120,7 @@ where COMPUTE_COMMANDS_COUNTERS .for_command(ComputeCommandKind::Basebackup) .inc(); - let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(&ctx); + let metric_recording = metrics::BASEBACKUP_QUERY_TIME.start_recording(); let res = async { self.handle_basebackup_request( pgb, diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index d48a1ba117fc..a00ec761e25c 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -203,9 +203,13 @@ impl Timeline { ) -> Result { match version { Version::Lsn(effective_lsn) => { - let pages = smallvec::smallvec![(tag, blknum)]; + let pages: 
smallvec::SmallVec<[_; 1]> = smallvec::smallvec![(tag, blknum)]; let res = self - .get_rel_page_at_lsn_batched(pages, effective_lsn, ctx) + .get_rel_page_at_lsn_batched( + pages.iter().map(|(tag, blknum)| (tag, blknum)), + effective_lsn, + ctx, + ) .await; assert_eq!(res.len(), 1); res.into_iter().next().unwrap() @@ -240,7 +244,7 @@ impl Timeline { /// The ordering of the returned vec corresponds to the ordering of `pages`. pub(crate) async fn get_rel_page_at_lsn_batched( &self, - pages: smallvec::SmallVec<[(RelTag, BlockNumber); 1]>, + pages: impl ExactSizeIterator, effective_lsn: Lsn, ctx: &RequestContext, ) -> Vec> { @@ -254,7 +258,7 @@ impl Timeline { let result_slots = result.spare_capacity_mut(); let mut keys_slots: BTreeMap> = BTreeMap::default(); - for (response_slot_idx, (tag, blknum)) in pages.into_iter().enumerate() { + for (response_slot_idx, (tag, blknum)) in pages.enumerate() { if tag.relnode == 0 { result_slots[response_slot_idx].write(Err(PageReconstructError::Other( RelationError::InvalidRelnode.into(), @@ -265,7 +269,7 @@ impl Timeline { } let nblocks = match self - .get_rel_size(tag, Version::Lsn(effective_lsn), ctx) + .get_rel_size(*tag, Version::Lsn(effective_lsn), ctx) .await { Ok(nblocks) => nblocks, @@ -276,7 +280,7 @@ impl Timeline { } }; - if blknum >= nblocks { + if *blknum >= nblocks { debug!( "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page", tag, blknum, effective_lsn, nblocks @@ -286,7 +290,7 @@ impl Timeline { continue; } - let key = rel_block_to_key(tag, blknum); + let key = rel_block_to_key(*tag, *blknum); let key_slots = keys_slots.entry(key).or_default(); key_slots.push(response_slot_idx); diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs index 6a8095390177..7c4de55a476e 100644 --- a/pageserver/src/tenant/throttle.rs +++ b/pageserver/src/tenant/throttle.rs @@ -2,14 +2,14 @@ use std::{ str::FromStr, sync::{ atomic::{AtomicU64, Ordering}, - Arc, Mutex, + Arc, }, time::{Duration, Instant}, }; use arc_swap::ArcSwap; use enumset::EnumSet; -use tracing::{error, warn}; +use tracing::error; use utils::leaky_bucket::{LeakyBucketConfig, RateLimiter}; use crate::{context::RequestContext, task_mgr::TaskKind}; @@ -162,19 +162,6 @@ where .fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed); let observation = Observation { wait_time }; self.metric.observe_throttling(&observation); - match ctx.micros_spent_throttled.add(wait_time) { - Ok(res) => res, - Err(error) => { - use once_cell::sync::Lazy; - use utils::rate_limit::RateLimit; - static WARN_RATE_LIMIT: Lazy> = - Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); - let mut guard = WARN_RATE_LIMIT.lock().unwrap(); - guard.call(move || { - warn!(error, "error adding time spent throttled; this message is logged at a global rate limit"); - }); - } - } Some(wait_time) } else { None diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 730477a7f4dc..dc3f823f20c8 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1059,7 +1059,8 @@ impl Timeline { .map(|metric| (metric, Instant::now())); // start counting after throttle so that throttle time - // is always less than observation time + // is always less than observation time and we don't + // underflow when computing `ex_throttled` below. 
let throttled = self .timeline_get_throttle .throttle(ctx, key_count as usize) @@ -1138,7 +1139,9 @@ impl Timeline { .map(ScanLatencyOngoingRecording::start_recording); // start counting after throttle so that throttle time - // is always less than observation time + // is always less than observation time and we don't + // underflow when computing the `ex_throttled` value in + // `recording.observe(throttled)` below. let throttled = self .timeline_get_throttle // assume scan = 1 quota for now until we find a better way to process this diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 3f90c233a635..ffdbd988a58f 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -173,6 +173,7 @@ def counter(name: str) -> str: counter("pageserver_tenant_throttling_count_accounted_finish"), counter("pageserver_tenant_throttling_wait_usecs_sum"), counter("pageserver_tenant_throttling_count"), + *histogram("pageserver_page_service_batch_size"), *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, # "pageserver_directory_entries_count", -- only used if above a certain threshold # "pageserver_broken_tenants_count" -- used only for broken diff --git a/test_runner/performance/pageserver/test_page_service_batching.py b/test_runner/performance/pageserver/test_page_service_batching.py index c47a849fec7c..562094a059d2 100644 --- a/test_runner/performance/pageserver/test_page_service_batching.py +++ b/test_runner/performance/pageserver/test_page_service_batching.py @@ -167,18 +167,18 @@ def test_throughput( @dataclass class Metrics: time: float - pageserver_getpage_count: float - pageserver_vectored_get_count: float + pageserver_batch_size_histo_sum: float + pageserver_batch_size_histo_count: float compute_getpage_count: float pageserver_cpu_seconds_total: float def __sub__(self, other: "Metrics") -> "Metrics": return Metrics( time=self.time - other.time, - pageserver_getpage_count=self.pageserver_getpage_count - - other.pageserver_getpage_count, - pageserver_vectored_get_count=self.pageserver_vectored_get_count - - other.pageserver_vectored_get_count, + pageserver_batch_size_histo_sum=self.pageserver_batch_size_histo_sum + - other.pageserver_batch_size_histo_sum, + pageserver_batch_size_histo_count=self.pageserver_batch_size_histo_count + - other.pageserver_batch_size_histo_count, compute_getpage_count=self.compute_getpage_count - other.compute_getpage_count, pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total - other.pageserver_cpu_seconds_total, @@ -187,8 +187,8 @@ def __sub__(self, other: "Metrics") -> "Metrics": def normalize(self, by) -> "Metrics": return Metrics( time=self.time / by, - pageserver_getpage_count=self.pageserver_getpage_count / by, - pageserver_vectored_get_count=self.pageserver_vectored_get_count / by, + pageserver_batch_size_histo_sum=self.pageserver_batch_size_histo_sum / by, + pageserver_batch_size_histo_count=self.pageserver_batch_size_histo_count / by, compute_getpage_count=self.compute_getpage_count / by, pageserver_cpu_seconds_total=self.pageserver_cpu_seconds_total / by, ) @@ -202,11 +202,11 @@ def get_metrics() -> Metrics: pageserver_metrics = ps_http.get_metrics() return Metrics( time=time.time(), - pageserver_getpage_count=pageserver_metrics.query_one( - "pageserver_smgr_query_seconds_count", {"smgr_query_type": "get_page_at_lsn"} + pageserver_batch_size_histo_sum=pageserver_metrics.query_one( + "pageserver_page_service_batch_size_sum" ).value, - pageserver_vectored_get_count=pageserver_metrics.query_one( 
- "pageserver_get_vectored_seconds_count", {"task_kind": "PageRequestHandler"} + pageserver_batch_size_histo_count=pageserver_metrics.query_one( + "pageserver_page_service_batch_size_count" ).value, compute_getpage_count=compute_getpage_count, pageserver_cpu_seconds_total=pageserver_metrics.query_one( @@ -243,7 +243,7 @@ def workload() -> Metrics: # Sanity-checks on the collected data # # assert that getpage counts roughly match between compute and ps - assert metrics.pageserver_getpage_count == pytest.approx( + assert metrics.pageserver_batch_size_histo_sum == pytest.approx( metrics.compute_getpage_count, rel=0.01 ) @@ -256,7 +256,7 @@ def workload() -> Metrics: zenbenchmark.record( "perfmetric.batching_factor", - metrics.pageserver_getpage_count / metrics.pageserver_vectored_get_count, + metrics.pageserver_batch_size_histo_sum / metrics.pageserver_batch_size_histo_count, unit="", report=MetricReport.HIGHER_IS_BETTER, ) diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py index ba6a1d90451e..62aec50a9ed3 100644 --- a/test_runner/regress/test_pageserver_getpage_throttle.py +++ b/test_runner/regress/test_pageserver_getpage_throttle.py @@ -4,6 +4,7 @@ import json import uuid +import pytest from anyio import Path from fixtures.common_types import TenantId, TimelineId from fixtures.log_helper import log @@ -70,14 +71,21 @@ def run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs: i log.info("warmup / make sure metrics are present") run_pagebench_at_max_speed_and_get_total_requests_completed(2) - metrics_query = { + smgr_metrics_query = { "tenant_id": str(tenant_id), "timeline_id": str(timeline_id), "smgr_query_type": "get_page_at_lsn", } - metric_name = "pageserver_smgr_query_seconds_sum" - smgr_query_seconds_pre = ps_http.get_metric_value(metric_name, metrics_query) + smgr_metric_name = "pageserver_smgr_query_seconds_sum" + throttle_metrics_query = { + "tenant_id": str(tenant_id), + } + throttle_metric_name = "pageserver_tenant_throttling_wait_usecs_sum_total" + + smgr_query_seconds_pre = ps_http.get_metric_value(smgr_metric_name, smgr_metrics_query) assert smgr_query_seconds_pre is not None + throttled_usecs_pre = ps_http.get_metric_value(throttle_metric_name, throttle_metrics_query) + assert throttled_usecs_pre is not None marker = uuid.uuid4().hex ps_http.post_tracing_event("info", marker) @@ -108,14 +116,23 @@ def run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs: i timeout=compaction_period, ) - log.info("validate that the metric doesn't include throttle wait time") - smgr_query_seconds_post = ps_http.get_metric_value(metric_name, metrics_query) + log.info("the smgr metric includes throttle time") + smgr_query_seconds_post = ps_http.get_metric_value(smgr_metric_name, smgr_metrics_query) assert smgr_query_seconds_post is not None + throttled_usecs_post = ps_http.get_metric_value(throttle_metric_name, throttle_metrics_query) + assert throttled_usecs_post is not None actual_smgr_query_seconds = smgr_query_seconds_post - smgr_query_seconds_pre + actual_throttled_usecs = throttled_usecs_post - throttled_usecs_pre + actual_throttled_secs = actual_throttled_usecs / 1_000_000 assert ( - duration_secs >= 10 * actual_smgr_query_seconds - ), "smgr metrics should not include throttle wait time" + pytest.approx(duration_secs, 0.1) == actual_smgr_query_seconds + ), "smgr metrics include throttle wait time" + smgr_ex_throttle = actual_smgr_query_seconds - actual_throttled_secs + assert 
smgr_ex_throttle > 0 + assert ( + duration_secs > 10 * smgr_ex_throttle + ), "most of the time in this test is spent throttled because the rate-limit's contribution to latency dominates" throttle_config_with_field_fair_set = { From a2a942f93cedf6cbba6ea3184d39dfffe250dd47 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Tue, 3 Dec 2024 12:25:29 +0100 Subject: [PATCH 15/65] Add support for the extensions test for Postgres v17 (#9748) ## Problem The extensions for Postgres v17 are ready but we do not test the extensions shipped with v17 ## Summary of changes Build the test image based on Postgres v17. Run the tests for v17. --------- Co-authored-by: Anastasia Lubennikova --- .github/workflows/build_and_test.yml | 15 +- compute/compute-node.Dockerfile | 42 ++--- ...hint_plan.patch => pg_hint_plan_v16.patch} | 0 compute/patches/pg_hint_plan_v17.patch | 174 ++++++++++++++++++ docker-compose/compute_wrapper/Dockerfile | 6 +- docker-compose/docker_compose_test.sh | 27 ++- 6 files changed, 219 insertions(+), 45 deletions(-) rename compute/patches/{pg_hint_plan.patch => pg_hint_plan_v16.patch} (100%) create mode 100644 compute/patches/pg_hint_plan_v17.patch diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 9830c2a0c9b2..e9e111e7bdae 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -669,7 +669,7 @@ jobs: neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }} - name: Build neon extensions test image - if: matrix.version.pg == 'v16' + if: matrix.version.pg >= 'v16' uses: docker/build-push-action@v6 with: context: . @@ -684,8 +684,7 @@ jobs: pull: true file: compute/compute-node.Dockerfile target: neon-pg-ext-test - cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} - cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/neon-test-extensions-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} + cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} tags: | neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{needs.tag.outputs.build-tag}}-${{ matrix.version.debian }}-${{ matrix.arch }} @@ -708,7 +707,7 @@ jobs: push: true pull: true file: compute/compute-node.Dockerfile - cache-from: type=registry,ref=cache.neon.build/neon-test-extensions-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} + cache-from: type=registry,ref=cache.neon.build/compute-node-${{ matrix.version.pg }}:cache-${{ matrix.version.debian }}-${{ matrix.arch }} cache-to: ${{ github.ref_name == 'main' && format('type=registry,ref=cache.neon.build/compute-tools-{0}:cache-{1}-{2},mode=max', matrix.version.pg, matrix.version.debian, matrix.arch) || '' }} tags: | neondatabase/compute-tools:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-${{ matrix.arch }} @@ -744,7 +743,7 @@ jobs: neondatabase/compute-node-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }}-arm64 - name: Create multi-arch neon-test-extensions image - if: matrix.version.pg == 'v16' + if: matrix.version.pg >= 'v16' run: | docker buildx imagetools create -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ 
needs.tag.outputs.build-tag }} \ -t neondatabase/neon-test-extensions-${{ matrix.version.pg }}:${{ needs.tag.outputs.build-tag }}-${{ matrix.version.debian }} \ @@ -833,6 +832,7 @@ jobs: fail-fast: false matrix: arch: [ x64, arm64 ] + pg_version: [v16, v17] runs-on: ${{ fromJson(format('["self-hosted", "{0}"]', matrix.arch == 'arm64' && 'small-arm64' || 'small')) }} @@ -871,7 +871,10 @@ jobs: - name: Verify docker-compose example and test extensions timeout-minutes: 20 - run: env TAG=${{needs.tag.outputs.build-tag}} ./docker-compose/docker_compose_test.sh + env: + TAG: ${{needs.tag.outputs.build-tag}} + TEST_VERSION_ONLY: ${{ matrix.pg_version }} + run: ./docker-compose/docker_compose_test.sh - name: Print logs and clean up if: always() diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index 222a0cb88b59..bf6311bf2b8d 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1367,15 +1367,12 @@ RUN make PG_VERSION="${PG_VERSION}" -C compute FROM neon-pg-ext-build AS neon-pg-ext-test ARG PG_VERSION -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - mkdir /ext-src +RUN mkdir /ext-src #COPY --from=postgis-build /postgis.tar.gz /ext-src/ #COPY --from=postgis-build /sfcgal/* /usr COPY --from=plv8-build /plv8.tar.gz /ext-src/ -COPY --from=h3-pg-build /h3-pg.tar.gz /ext-src/ +#COPY --from=h3-pg-build /h3-pg.tar.gz /ext-src/ COPY --from=unit-pg-build /postgresql-unit.tar.gz /ext-src/ COPY --from=vector-pg-build /pgvector.tar.gz /ext-src/ COPY --from=vector-pg-build /pgvector.patch /ext-src/ @@ -1395,7 +1392,7 @@ COPY --from=hll-pg-build /hll.tar.gz /ext-src COPY --from=plpgsql-check-pg-build /plpgsql_check.tar.gz /ext-src #COPY --from=timescaledb-pg-build /timescaledb.tar.gz /ext-src COPY --from=pg-hint-plan-pg-build /pg_hint_plan.tar.gz /ext-src -COPY compute/patches/pg_hint_plan.patch /ext-src +COPY compute/patches/pg_hint_plan_${PG_VERSION}.patch /ext-src COPY --from=pg-cron-pg-build /pg_cron.tar.gz /ext-src COPY compute/patches/pg_cron.patch /ext-src #COPY --from=pg-pgx-ulid-build /home/nonroot/pgx_ulid.tar.gz /ext-src @@ -1405,38 +1402,23 @@ COPY --from=pg-roaringbitmap-pg-build /pg_roaringbitmap.tar.gz /ext-src COPY --from=pg-semver-pg-build /pg_semver.tar.gz /ext-src #COPY --from=pg-embedding-pg-build /home/nonroot/pg_embedding-src/ /ext-src #COPY --from=wal2json-pg-build /wal2json_2_5.tar.gz /ext-src -COPY --from=pg-anon-pg-build /pg_anon.tar.gz /ext-src +#pg_anon is not supported yet for pg v17 so, don't fail if nothing found +COPY --from=pg-anon-pg-build /pg_anon.tar.g? /ext-src COPY compute/patches/pg_anon.patch /ext-src COPY --from=pg-ivm-build /pg_ivm.tar.gz /ext-src COPY --from=pg-partman-build /pg_partman.tar.gz /ext-src -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - cd /ext-src/ && for f in *.tar.gz; \ +RUN cd /ext-src/ && for f in *.tar.gz; \ do echo $f; dname=$(echo $f | sed 's/\.tar.*//')-src; \ rm -rf $dname; mkdir $dname; tar xzf $f --strip-components=1 -C $dname \ || exit 1; rm -f $f; done -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - cd /ext-src/rum-src && patch -p1 <../rum.patch -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. 
Quit" && exit 0;; \ - esac && \ - cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch -RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan.patch +RUN cd /ext-src/rum-src && patch -p1 <../rum.patch +RUN cd /ext-src/pgvector-src && patch -p1 <../pgvector.patch +RUN cd /ext-src/pg_hint_plan-src && patch -p1 < /ext-src/pg_hint_plan_${PG_VERSION}.patch COPY --chmod=755 docker-compose/run-tests.sh /run-tests.sh RUN case "${PG_VERSION}" in "v17") \ - echo "v17 extensions are not supported yet. Quit" && exit 0;; \ - esac && \ - patch -p1 $SPEC_PATH/spec.json + fi PG_VERSION=$pg_version PG_TEST_VERSION=$PG_TEST_VERSION docker compose --profile test-extensions -f $COMPOSE_FILE up --build -d echo "wait until the compute is ready. timeout after 60s. " @@ -54,8 +61,7 @@ for pg_version in 14 15 16; do fi done - if [ $pg_version -ge 16 ] - then + if [ $pg_version -ge 16 ]; then echo Enabling trust connection docker exec $COMPUTE_CONTAINER_NAME bash -c "sed -i '\$d' /var/db/postgres/compute/pg_hba.conf && echo -e 'host\t all\t all\t all\t trust' >> /var/db/postgres/compute/pg_hba.conf && psql $PSQL_OPTION -c 'select pg_reload_conf()' " echo Adding postgres role @@ -68,10 +74,13 @@ for pg_version in 14 15 16; do # The test assumes that it is running on the same host with the postgres engine. # In our case it's not true, that's why we are copying files to the compute node TMPDIR=$(mktemp -d) - docker cp $TEST_CONTAINER_NAME:/ext-src/pg_anon-src/data $TMPDIR/data - echo -e '1\t too \t many \t tabs' > $TMPDIR/data/bad.csv - docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/tmp/tmp_anon_alternate_data + # Add support for pg_anon for pg_v16 + if [ $pg_version -ne 17 ]; then + docker cp $TEST_CONTAINER_NAME:/ext-src/pg_anon-src/data $TMPDIR/data + echo -e '1\t too \t many \t tabs' > $TMPDIR/data/bad.csv + docker cp $TMPDIR/data $COMPUTE_CONTAINER_NAME:/tmp/tmp_anon_alternate_data rm -rf $TMPDIR + fi TMPDIR=$(mktemp -d) # The following block does the same for the pg_hintplan test docker cp $TEST_CONTAINER_NAME:/ext-src/pg_hint_plan-src/data $TMPDIR/data @@ -97,4 +106,8 @@ for pg_version in 14 15 16; do fi fi cleanup + # The support of pg_anon not yet added to PG17, so we have to remove the corresponding option + if [ $pg_version -eq 17 ]; then + mv $SPEC_PATH/spec.bak $SPEC_PATH/spec.json + fi done From dcb24ce170573a2ae6ed29467669d03c73b589e6 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 3 Dec 2024 12:35:59 +0100 Subject: [PATCH 16/65] safekeeper,pageserver: add heap profiling (#9778) ## Problem We don't have good observability for memory usage. This would be useful e.g. to debug OOM incidents or optimize performance or resource usage. We would also like to use continuous profiling with e.g. [Grafana Cloud Profiles](https://grafana.com/products/cloud/profiles-for-continuous-profiling/) (see https://github.com/neondatabase/cloud/issues/14888). This PR is intended as a proof of concept, to try it out in staging and drive further discussions about profiling more broadly. Touches https://github.com/neondatabase/neon/issues/9534. Touches https://github.com/neondatabase/cloud/issues/14888. Depends on #9779. Depends on #9780. ## Summary of changes Adds a HTTP route `/profile/heap` that takes a heap profile and returns it. Query parameters: * `format`: output format (`jemalloc` or `pprof`; default `pprof`). 
Unlike CPU profiles (see #9764), heap profiles are not symbolized and require the original binary to translate addresses to function names. To make this work with Grafana, we'll probably have to symbolize the process server-side -- this is left as future work, as is other output formats like SVG. Heap profiles don't work on macOS due to limitations in jemalloc. --- Cargo.lock | 89 ++++++++++++++++++++++++------- Cargo.toml | 3 +- libs/utils/Cargo.toml | 1 + libs/utils/src/http/endpoint.rs | 64 ++++++++++++++++++++++ pageserver/src/bin/pageserver.rs | 5 ++ pageserver/src/http/routes.rs | 8 +-- safekeeper/benches/receive_wal.rs | 6 +++ safekeeper/src/bin/safekeeper.rs | 5 ++ safekeeper/src/http/routes.rs | 7 ++- workspace_hack/Cargo.toml | 15 ++++-- 10 files changed, 175 insertions(+), 28 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ba02e3b11d40..b2769e59f082 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -301,7 +301,7 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "fastrand 2.0.0", + "fastrand 2.2.0", "hex", "http 0.2.9", "hyper 0.14.30", @@ -341,7 +341,7 @@ dependencies = [ "aws-smithy-types", "aws-types", "bytes", - "fastrand 2.0.0", + "fastrand 2.2.0", "http 0.2.9", "http-body 0.4.5", "once_cell", @@ -417,7 +417,7 @@ dependencies = [ "aws-smithy-xml", "aws-types", "bytes", - "fastrand 2.0.0", + "fastrand 2.2.0", "hex", "hmac", "http 0.2.9", @@ -621,7 +621,7 @@ dependencies = [ "aws-smithy-runtime-api", "aws-smithy-types", "bytes", - "fastrand 2.0.0", + "fastrand 2.2.0", "h2 0.3.26", "http 0.2.9", "http-body 0.4.5", @@ -2054,9 +2054,9 @@ dependencies = [ [[package]] name = "fastrand" -version = "2.0.0" +version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6999dc1837253364c2ebb0704ba97994bd874e8f195d665c50b7548f6ea92764" +checksum = "486f806e73c5707928240ddc295403b1b93c96a02038563881c4a2fd84b81ac4" [[package]] name = "ff" @@ -2912,6 +2912,23 @@ version = "1.0.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" +[[package]] +name = "jemalloc_pprof" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a883828bd6a4b957cd9f618886ff19e5f3ebd34e06ba0e855849e049fef32fb" +dependencies = [ + "anyhow", + "libc", + "mappings", + "once_cell", + "pprof_util", + "tempfile", + "tikv-jemalloc-ctl", + "tokio", + "tracing", +] + [[package]] name = "jobserver" version = "0.1.32" @@ -3022,9 +3039,9 @@ dependencies = [ [[package]] name = "libc" -version = "0.2.150" +version = "0.2.167" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "89d92a4743f9a61002fae18374ed11e7973f530cb3a3255fb354818118b2203c" +checksum = "09d6582e104315a817dff97f75133544b2e094ee22447d2acf4a74e189ba06fc" [[package]] name = "libloading" @@ -3044,9 +3061,9 @@ checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" [[package]] name = "linux-raw-sys" -version = "0.4.13" +version = "0.4.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01cda141df6706de531b6c46c3a33ecca755538219bd484262fa09410c13539c" +checksum = "78b3ae25bc7c8c38cec158d1f2757ee79e9b3740fbc7ccf0e59e4b08d793fa89" [[package]] name = "linux-raw-sys" @@ -3079,6 +3096,19 @@ dependencies = [ "hashbrown 0.14.5", ] +[[package]] +name = "mappings" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"ce9229c438fbf1c333926e2053c4c091feabbd40a1b590ec62710fea2384af9e" +dependencies = [ + "anyhow", + "libc", + "once_cell", + "pprof_util", + "tracing", +] + [[package]] name = "matchers" version = "0.1.0" @@ -3346,6 +3376,7 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b05180d69e3da0e530ba2a1dae5110317e49e3b7f3d41be227dc5f92e49ee7af" dependencies = [ + "num-bigint", "num-complex", "num-integer", "num-iter", @@ -3434,6 +3465,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0" dependencies = [ "autocfg", + "num-bigint", "num-integer", "num-traits", ] @@ -3497,9 +3529,9 @@ dependencies = [ [[package]] name = "once_cell" -version = "1.18.0" +version = "1.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd8b5dd2ae5ed71462c540258bedcb51965123ad7e7ccf4b9a8cafaa4a63576d" +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" [[package]] name = "oorandom" @@ -4298,6 +4330,19 @@ dependencies = [ "thiserror", ] +[[package]] +name = "pprof_util" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "65c568b3f8c1c37886ae07459b1946249e725c315306b03be5632f84c239f781" +dependencies = [ + "anyhow", + "flate2", + "num", + "paste", + "prost", +] + [[package]] name = "ppv-lite86" version = "0.2.17" @@ -5220,14 +5265,14 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.28" +version = "0.38.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "72e572a5e8ca657d7366229cdde4bd14c4eb5499a9573d4d366fe1b599daa316" +checksum = "d7f649912bc1495e167a6edee79151c84b1bad49748cb4f1f1167f459f6224f6" dependencies = [ "bitflags 2.4.1", "errno", "libc", - "linux-raw-sys 0.4.13", + "linux-raw-sys 0.4.14", "windows-sys 0.52.0", ] @@ -6251,13 +6296,13 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.9.0" +version = "3.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "01ce4141aa927a6d1bd34a041795abd0db1cccba5d5f24b009f694bdf3a1f3fa" +checksum = "28cce251fcbc87fac86a866eeb0d6c2d536fc16d06f184bb61aeae11aa4cee0c" dependencies = [ "cfg-if", - "fastrand 2.0.0", - "redox_syscall 0.4.1", + "fastrand 2.2.0", + "once_cell", "rustix", "windows-sys 0.52.0", ] @@ -7058,6 +7103,7 @@ dependencies = [ "hex-literal", "humantime", "hyper 0.14.30", + "jemalloc_pprof", "jsonwebtoken", "metrics", "nix 0.27.1", @@ -7644,8 +7690,12 @@ dependencies = [ "memchr", "nix 0.26.4", "nom", + "num", "num-bigint", + "num-complex", "num-integer", + "num-iter", + "num-rational", "num-traits", "once_cell", "parquet", @@ -7669,6 +7719,7 @@ dependencies = [ "subtle", "syn 2.0.90", "sync_wrapper 0.1.2", + "tikv-jemalloc-ctl", "tikv-jemalloc-sys", "time", "time-macros", diff --git a/Cargo.toml b/Cargo.toml index 036dc0105783..91fa6a260768 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -115,6 +115,7 @@ indoc = "2" ipnet = "2.10.0" itertools = "0.10" itoa = "1.0.11" +jemalloc_pprof = "0.6" jsonwebtoken = "9" lasso = "0.7" libc = "0.2" @@ -175,7 +176,7 @@ sync_wrapper = "0.1.2" tar = "0.4" test-context = "0.3" thiserror = "1.0" -tikv-jemallocator = { version = "0.6", features = ["stats"] } +tikv-jemallocator = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] } tikv-jemalloc-ctl = { version = "0.6", features = ["stats"] } tokio = { version = "1.17", features = ["macros"] } tokio-epoll-uring = { 
git = "https://github.com/neondatabase/tokio-epoll-uring.git" , branch = "main" } diff --git a/libs/utils/Cargo.toml b/libs/utils/Cargo.toml index 5648072a83c2..66500fb141bc 100644 --- a/libs/utils/Cargo.toml +++ b/libs/utils/Cargo.toml @@ -26,6 +26,7 @@ humantime.workspace = true hyper0 = { workspace = true, features = ["full"] } fail.workspace = true futures = { workspace = true} +jemalloc_pprof.workspace = true jsonwebtoken.workspace = true nix.workspace = true once_cell.workspace = true diff --git a/libs/utils/src/http/endpoint.rs b/libs/utils/src/http/endpoint.rs index 6a85f0ddeb26..d975b63677ac 100644 --- a/libs/utils/src/http/endpoint.rs +++ b/libs/utils/src/http/endpoint.rs @@ -10,6 +10,7 @@ use metrics::{register_int_counter, Encoder, IntCounter, TextEncoder}; use once_cell::sync::Lazy; use routerify::ext::RequestExt; use routerify::{Middleware, RequestInfo, Router, RouterBuilder}; +use tokio_util::io::ReaderStream; use tracing::{debug, info, info_span, warn, Instrument}; use std::future::Future; @@ -407,6 +408,69 @@ pub async fn profile_cpu_handler(req: Request) -> Result, A } } +/// Generates heap profiles. +/// +/// This only works with jemalloc on Linux. +pub async fn profile_heap_handler(req: Request) -> Result, ApiError> { + enum Format { + Jemalloc, + Pprof, + } + + // Parameters. + let format = match get_query_param(&req, "format")?.as_deref() { + None => Format::Pprof, + Some("jemalloc") => Format::Jemalloc, + Some("pprof") => Format::Pprof, + Some(format) => return Err(ApiError::BadRequest(anyhow!("invalid format {format}"))), + }; + + // Obtain profiler handle. + let mut prof_ctl = jemalloc_pprof::PROF_CTL + .as_ref() + .ok_or(ApiError::InternalServerError(anyhow!( + "heap profiling not enabled" + )))? + .lock() + .await; + if !prof_ctl.activated() { + return Err(ApiError::InternalServerError(anyhow!( + "heap profiling not enabled" + ))); + } + + // Take and return the profile. + match format { + Format::Jemalloc => { + // NB: file is an open handle to a tempfile that's already deleted. + let file = tokio::task::spawn_blocking(move || prof_ctl.dump()) + .await + .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? + .map_err(ApiError::InternalServerError)?; + let stream = ReaderStream::new(tokio::fs::File::from_std(file)); + Response::builder() + .status(200) + .header(CONTENT_TYPE, "application/octet-stream") + .header(CONTENT_DISPOSITION, "attachment; filename=\"heap.dump\"") + .body(Body::wrap_stream(stream)) + .map_err(|err| ApiError::InternalServerError(err.into())) + } + + Format::Pprof => { + let data = tokio::task::spawn_blocking(move || prof_ctl.dump_pprof()) + .await + .map_err(|join_err| ApiError::InternalServerError(join_err.into()))? + .map_err(ApiError::InternalServerError)?; + Response::builder() + .status(200) + .header(CONTENT_TYPE, "application/octet-stream") + .header(CONTENT_DISPOSITION, "attachment; filename=\"heap.pb\"") + .body(Body::from(data)) + .map_err(|err| ApiError::InternalServerError(err.into())) + } + } +} + pub fn add_request_id_middleware( ) -> Middleware { Middleware::pre(move |req| async move { diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 31f437085519..8fe225c6aa90 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -53,6 +53,11 @@ project_build_tag!(BUILD_TAG); #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; +/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20). 
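+/// (`lg_prof_sample:20` is the log2 of the average sampling interval: one sample per 2^20 bytes = 1 MiB allocated.)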
+#[allow(non_upper_case_globals)] +#[export_name = "malloc_conf"] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; + const PID_FILE_NAME: &str = "pageserver.pid"; const FEATURES: &[&str] = &[ diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index ceb1c3b012f5..e127871549ea 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -56,9 +56,9 @@ use tokio_util::sync::CancellationToken; use tracing::*; use utils::auth::JwtAuth; use utils::failpoint_support::failpoints_handler; -use utils::http::endpoint::profile_cpu_handler; -use utils::http::endpoint::prometheus_metrics_handler; -use utils::http::endpoint::request_span; +use utils::http::endpoint::{ + profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span, +}; use utils::http::request::must_parse_query_param; use utils::http::request::{get_request_param, must_get_query_param, parse_query_param}; @@ -155,6 +155,7 @@ impl State { "/swagger.yml", "/metrics", "/profile/cpu", + "/profile/heap", ]; Ok(Self { conf, @@ -3203,6 +3204,7 @@ pub fn make_router( .data(state) .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) .get("/profile/cpu", |r| request_span(r, profile_cpu_handler)) + .get("/profile/heap", |r| request_span(r, profile_heap_handler)) .get("/v1/status", |r| api_handler(r, status_handler)) .put("/v1/failpoints", |r| { testing_api_handler("manage failpoints", r, failpoints_handler) diff --git a/safekeeper/benches/receive_wal.rs b/safekeeper/benches/receive_wal.rs index 8c4281cf527e..313d945b942f 100644 --- a/safekeeper/benches/receive_wal.rs +++ b/safekeeper/benches/receive_wal.rs @@ -24,9 +24,15 @@ const KB: usize = 1024; const MB: usize = 1024 * KB; const GB: usize = 1024 * MB; +/// Use jemalloc, and configure it to sample allocations for profiles every 1 MB. +/// This mirrors the configuration in bin/safekeeper.rs. #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; +#[allow(non_upper_case_globals)] +#[export_name = "malloc_conf"] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; + // Register benchmarks with Criterion. criterion_group!( name = benches; diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 3659bcd7e048..4dc7edef371f 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -51,6 +51,11 @@ use utils::{ #[global_allocator] static GLOBAL: tikv_jemallocator::Jemalloc = tikv_jemallocator::Jemalloc; +/// Configure jemalloc to sample allocations for profiles every 1 MB (1 << 20). 
+#[allow(non_upper_case_globals)] +#[export_name = "malloc_conf"] +pub static malloc_conf: &[u8] = b"prof:true,prof_active:true,lg_prof_sample:20\0"; + const PID_FILE_NAME: &str = "safekeeper.pid"; const ID_FILE_NAME: &str = "safekeeper.id"; diff --git a/safekeeper/src/http/routes.rs b/safekeeper/src/http/routes.rs index 28294abdb929..69b775fd7673 100644 --- a/safekeeper/src/http/routes.rs +++ b/safekeeper/src/http/routes.rs @@ -14,7 +14,8 @@ use tokio_util::sync::CancellationToken; use tracing::{info_span, Instrument}; use utils::failpoint_support::failpoints_handler; use utils::http::endpoint::{ - profile_cpu_handler, prometheus_metrics_handler, request_span, ChannelWriter, + profile_cpu_handler, profile_heap_handler, prometheus_metrics_handler, request_span, + ChannelWriter, }; use utils::http::request::parse_query_param; @@ -573,7 +574,8 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder let mut router = endpoint::make_router(); if conf.http_auth.is_some() { router = router.middleware(auth_middleware(|request| { - const ALLOWLIST_ROUTES: &[&str] = &["/v1/status", "/metrics", "/profile/cpu"]; + const ALLOWLIST_ROUTES: &[&str] = + &["/v1/status", "/metrics", "/profile/cpu", "profile/heap"]; if ALLOWLIST_ROUTES.contains(&request.uri().path()) { None } else { @@ -594,6 +596,7 @@ pub fn make_router(conf: SafeKeeperConf) -> RouterBuilder .data(auth) .get("/metrics", |r| request_span(r, prometheus_metrics_handler)) .get("/profile/cpu", |r| request_span(r, profile_cpu_handler)) + .get("/profile/heap", |r| request_span(r, profile_heap_handler)) .get("/v1/status", |r| request_span(r, status_handler)) .put("/v1/failpoints", |r| { request_span(r, move |r| async { diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml index c0a3abc3774a..d19379aefdf6 100644 --- a/workspace_hack/Cargo.toml +++ b/workspace_hack/Cargo.toml @@ -55,12 +55,16 @@ log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } nix = { version = "0.26" } nom = { version = "7" } +num = { version = "0.4" } num-bigint = { version = "0.4" } +num-complex = { version = "0.4", default-features = false, features = ["std"] } num-integer = { version = "0.1", features = ["i128"] } +num-iter = { version = "0.1", default-features = false, features = ["i128", "std"] } +num-rational = { version = "0.4", default-features = false, features = ["num-bigint-std", "std"] } num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } parquet = { version = "53", default-features = false, features = ["zstd"] } -prost = { version = "0.13", features = ["prost-derive"] } +prost = { version = "0.13", features = ["no-recursion-limit", "prost-derive"] } rand = { version = "0.8", features = ["small_rng"] } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } @@ -76,7 +80,8 @@ smallvec = { version = "1", default-features = false, features = ["const_new", " spki = { version = "0.7", default-features = false, features = ["pem", "std"] } subtle = { version = "2" } sync_wrapper = { version = "0.1", default-features = false, features = ["futures"] } -tikv-jemalloc-sys = { version = "0.6", features = ["stats"] } +tikv-jemalloc-ctl = { version = "0.6", features = ["stats", "use_std"] } +tikv-jemalloc-sys = { version = "0.6", features = ["profiling", "stats", "unprefixed_malloc_on_supported_platforms"] } time = { version = "0.3", features 
= ["macros", "serde-well-known"] } tokio = { version = "1", features = ["full", "test-util"] } tokio-rustls = { version = "0.26", default-features = false, features = ["logging", "ring", "tls12"] } @@ -111,14 +116,18 @@ libc = { version = "0.2", features = ["extra_traits", "use_std"] } log = { version = "0.4", default-features = false, features = ["std"] } memchr = { version = "2" } nom = { version = "7" } +num = { version = "0.4" } num-bigint = { version = "0.4" } +num-complex = { version = "0.4", default-features = false, features = ["std"] } num-integer = { version = "0.1", features = ["i128"] } +num-iter = { version = "0.1", default-features = false, features = ["i128", "std"] } +num-rational = { version = "0.4", default-features = false, features = ["num-bigint-std", "std"] } num-traits = { version = "0.2", features = ["i128", "libm"] } once_cell = { version = "1" } parquet = { version = "53", default-features = false, features = ["zstd"] } prettyplease = { version = "0.2", default-features = false, features = ["verbatim"] } proc-macro2 = { version = "1" } -prost = { version = "0.13", features = ["prost-derive"] } +prost = { version = "0.13", features = ["no-recursion-limit", "prost-derive"] } quote = { version = "1" } regex = { version = "1" } regex-automata = { version = "0.4", default-features = false, features = ["dfa-onepass", "hybrid", "meta", "nfa-backtrack", "perf-inline", "perf-literal", "unicode"] } From bbe4dfa99154b679371e0bdfa9d648d1ebdae2ee Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 3 Dec 2024 15:33:31 +0100 Subject: [PATCH 17/65] test_runner: use immediate shutdown in `test_sharded_ingest` (#9984) ## Problem `test_sharded_ingest` ingests a lot of data, which can cause shutdown to be slow e.g. due to local "S3 uploads" or compactions. This can cause test flakes during teardown. Resolves #9740. ## Summary of changes Perform an immediate shutdown of the cluster. --- test_runner/performance/test_sharded_ingest.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/test_runner/performance/test_sharded_ingest.py b/test_runner/performance/test_sharded_ingest.py index 4c21e799c8e0..94fd54bade6a 100644 --- a/test_runner/performance/test_sharded_ingest.py +++ b/test_runner/performance/test_sharded_ingest.py @@ -90,6 +90,7 @@ def test_sharded_ingest( # Start the endpoint. endpoint = env.endpoints.create_start("main", tenant_id=tenant_id) start_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0]) + # Ingest data and measure WAL volume and duration. with closing(endpoint.connect()) as conn: with conn.cursor() as cur: @@ -104,6 +105,8 @@ def test_sharded_ingest( wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id) end_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0]) + + # Record metrics. wal_written_mb = round((end_lsn - start_lsn) / (1024 * 1024)) zenbenchmark.record("wal_written", wal_written_mb, "MB", MetricReport.TEST_PARAM) @@ -152,3 +155,7 @@ def test_sharded_ingest( log.info(f"WAL ingested by each pageserver {ingested_by_ps}") assert tenant_get_shards(env, tenant_id) == shards, "shards moved" + + # The pageservers can take a long time to shut down gracefully, presumably due to the upload + # queue or compactions or something. Just stop them immediately, we don't care. 
+    env.stop(immediate=True)

From 4d422b937c40722f1f19373ade3fcba976cb96a0 Mon Sep 17 00:00:00 2001
From: Christian Schwarz
Date: Tue, 3 Dec 2024 16:25:58 +0100
Subject: [PATCH 18/65] pageserver: only throttle pagestream requests & bring
 back throttling deduction for smgr latency metrics (#9962)

## Problem

In the batching PR
- https://github.com/neondatabase/neon/pull/9870

I stopped deducting the time-spent-in-throttle from latency metrics, i.e.,
- smgr latency metrics (`SmgrOpTimer`)
- basebackup latency (+scan latency, which I think is part of basebackup).

The reason for stopping the deduction was that with the introduction of
batching, the trick with tracking time-spent-in-throttle inside
RequestContext and swap-replacing it from the `impl Drop for SmgrOpTimer`
no longer worked with >1 requests in a batch.

However, deducting time-spent-in-throttle is desirable because our internal
latency SLO definition does not account for throttling.

## Summary of changes

- Redefine throttling to be a page_service pagestream request throttle
  instead of a throttle for repository `Key` reads through `Timeline::get` /
  `Timeline::get_vectored`.
  - This means reads done by `basebackup` are no longer subject to any
    throttle.
- The throttle applies after batching, before handling of the request.
- Drive-by fix: make throttle sensitive to cancellation.
- Rename metric label `kind` from `timeline_get` to `pagestream` to reflect
  the new scope of throttling.

To avoid config format breakage, we leave the config field named
`timeline_get_throttle` and ignore the `task_kinds` field. This will be
cleaned up in a future PR.

## Trade-Offs

Ideally, we would apply the throttle before reading a request off the
connection, so that we queue the minimal amount of work inside the process.
However, that's not possible because we need to do shard routing.

The redefinition of the throttle to limit pagestream request rate instead of
repository `Key` rate comes with several downsides:
- We're no longer able to use the throttle mechanism for other tasks, e.g.
  image layer creation. However, in practice, we never used that capability
  anyway.
- We no longer throttle basebackup.
---
 libs/pageserver_api/src/models.rs             | 58 ++++++++++-
 pageserver/src/metrics.rs                     | 95 ++++++++++++-------
 pageserver/src/page_service.rs                | 45 ++++++++-
 pageserver/src/tenant.rs                      | 20 ++--
 pageserver/src/tenant/tasks.rs                |  6 +-
 pageserver/src/tenant/throttle.rs             | 33 ++-----
 pageserver/src/tenant/timeline.rs             | 54 ++---------
 pageserver/src/tenant/timeline/delete.rs      |  2 +-
 .../test_pageserver_getpage_throttle.py       | 16 ++--
 9 files changed, 198 insertions(+), 131 deletions(-)

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 42c5d10c053b..5488f7b2c29b 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -501,7 +501,9 @@ pub struct EvictionPolicyLayerAccessThreshold {
 
 #[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
 pub struct ThrottleConfig {
-    pub task_kinds: Vec, // TaskKind
+    /// See [`ThrottleConfigTaskKinds`] for why we do the serde `rename`.
+    #[serde(rename = "task_kinds")]
+    pub enabled: ThrottleConfigTaskKinds,
     pub initial: u32,
     #[serde(with = "humantime_serde")]
     pub refill_interval: Duration,
@@ -509,10 +511,38 @@ pub struct ThrottleConfig {
     pub max: u32,
 }
 
+/// Before
+/// +/// After that PR, the throttle is applied at pagestream request level +/// and the `task_kinds` field does not apply since the only task kind +/// that us subject to the throttle is that of the page service. +/// +/// However, we don't want to make a breaking config change right now +/// because it means we have to migrate all the tenant configs. +/// This will be done in a future PR. +/// +/// In the meantime, we use emptiness / non-emptsiness of the `task_kinds` +/// field to determine if the throttle is enabled or not. +#[derive(Serialize, Deserialize, Debug, Clone, PartialEq, Eq)] +#[serde(transparent)] +pub struct ThrottleConfigTaskKinds(Vec); + +impl ThrottleConfigTaskKinds { + pub fn disabled() -> Self { + Self(vec![]) + } + pub fn is_enabled(&self) -> bool { + !self.0.is_empty() + } +} + impl ThrottleConfig { pub fn disabled() -> Self { Self { - task_kinds: vec![], // effectively disables the throttle + enabled: ThrottleConfigTaskKinds::disabled(), // other values don't matter with emtpy `task_kinds`. initial: 0, refill_interval: Duration::from_millis(1), @@ -526,6 +556,30 @@ impl ThrottleConfig { } } +#[cfg(test)] +mod throttle_config_tests { + use super::*; + + #[test] + fn test_disabled_is_disabled() { + let config = ThrottleConfig::disabled(); + assert!(!config.enabled.is_enabled()); + } + #[test] + fn test_enabled_backwards_compat() { + let input = serde_json::json!({ + "task_kinds": ["PageRequestHandler"], + "initial": 40000, + "refill_interval": "50ms", + "refill_amount": 1000, + "max": 40000, + "fair": true + }); + let config: ThrottleConfig = serde_json::from_value(input).unwrap(); + assert!(config.enabled.is_enabled()); + } +} + /// A flattened analog of a `pagesever::tenant::LocationMode`, which /// lists out all possible states (and the virtual "Detached" state) /// in a flat form rather than using rust-style enums. 
diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index d04fae762770..998c15ccaf2c 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -217,31 +217,16 @@ impl<'a> ScanLatencyOngoingRecording<'a> { ScanLatencyOngoingRecording { parent, start } } - pub(crate) fn observe(self, throttled: Option) { + pub(crate) fn observe(self) { let elapsed = self.start.elapsed(); - let ex_throttled = if let Some(throttled) = throttled { - elapsed.checked_sub(throttled) - } else { - Some(elapsed) - }; - if let Some(ex_throttled) = ex_throttled { - self.parent.observe(ex_throttled.as_secs_f64()); - } else { - use utils::rate_limit::RateLimit; - static LOGGED: Lazy> = - Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); - let mut rate_limit = LOGGED.lock().unwrap(); - rate_limit.call(|| { - warn!("error deducting time spent throttled; this message is logged at a global rate limit"); - }); - } + self.parent.observe(elapsed.as_secs_f64()); } } pub(crate) static GET_VECTORED_LATENCY: Lazy = Lazy::new(|| { let inner = register_histogram_vec!( "pageserver_get_vectored_seconds", - "Time spent in get_vectored, excluding time spent in timeline_get_throttle.", + "Time spent in get_vectored.", &["task_kind"], CRITICAL_OP_BUCKETS.into(), ) @@ -264,7 +249,7 @@ pub(crate) static GET_VECTORED_LATENCY: Lazy = Lazy::new(|| pub(crate) static SCAN_LATENCY: Lazy = Lazy::new(|| { let inner = register_histogram_vec!( "pageserver_scan_seconds", - "Time spent in scan, excluding time spent in timeline_get_throttle.", + "Time spent in scan.", &["task_kind"], CRITICAL_OP_BUCKETS.into(), ) @@ -1227,11 +1212,44 @@ pub(crate) struct SmgrOpTimer { per_timeline_latency_histo: Option, start: Instant, + throttled: Duration, + op: SmgrQueryType, +} + +impl SmgrOpTimer { + pub(crate) fn deduct_throttle(&mut self, throttle: &Option) { + let Some(throttle) = throttle else { + return; + }; + self.throttled += *throttle; + } } impl Drop for SmgrOpTimer { fn drop(&mut self) { - let elapsed = self.start.elapsed().as_secs_f64(); + let elapsed = self.start.elapsed(); + + let elapsed = match elapsed.checked_sub(self.throttled) { + Some(elapsed) => elapsed, + None => { + use utils::rate_limit::RateLimit; + static LOGGED: Lazy>> = + Lazy::new(|| { + Mutex::new(enum_map::EnumMap::from_array(std::array::from_fn(|_| { + RateLimit::new(Duration::from_secs(10)) + }))) + }); + let mut guard = LOGGED.lock().unwrap(); + let rate_limit = &mut guard[self.op]; + rate_limit.call(|| { + warn!(op=?self.op, ?elapsed, ?self.throttled, "implementation error: time spent throttled exceeds total request wall clock time"); + }); + elapsed // un-throttled time, more info than just saturating to 0 + } + }; + + let elapsed = elapsed.as_secs_f64(); + self.global_latency_histo.observe(elapsed); if let Some(per_timeline_getpage_histo) = &self.per_timeline_latency_histo { per_timeline_getpage_histo.observe(elapsed); @@ -1491,6 +1509,8 @@ impl SmgrQueryTimePerTimeline { global_latency_histo: self.global_latency[op as usize].clone(), per_timeline_latency_histo, start: started_at, + op, + throttled: Duration::ZERO, } } @@ -3299,7 +3319,7 @@ pub(crate) mod tenant_throttling { use once_cell::sync::Lazy; use utils::shard::TenantShardId; - use crate::tenant::{self, throttle::Metric}; + use crate::tenant::{self}; struct GlobalAndPerTenantIntCounter { global: IntCounter, @@ -3318,7 +3338,7 @@ pub(crate) mod tenant_throttling { } } - pub(crate) struct TimelineGet { + pub(crate) struct Metrics { count_accounted_start: 
GlobalAndPerTenantIntCounter, count_accounted_finish: GlobalAndPerTenantIntCounter, wait_time: GlobalAndPerTenantIntCounter, @@ -3391,40 +3411,41 @@ pub(crate) mod tenant_throttling { .unwrap() }); - const KIND: &str = "timeline_get"; + const KINDS: &[&str] = &["pagestream"]; + pub type Pagestream = Metrics<0>; - impl TimelineGet { + impl Metrics { pub(crate) fn new(tenant_shard_id: &TenantShardId) -> Self { let per_tenant_label_values = &[ - KIND, + KINDS[KIND], &tenant_shard_id.tenant_id.to_string(), &tenant_shard_id.shard_slug().to_string(), ]; - TimelineGet { + Metrics { count_accounted_start: { GlobalAndPerTenantIntCounter { - global: COUNT_ACCOUNTED_START.with_label_values(&[KIND]), + global: COUNT_ACCOUNTED_START.with_label_values(&[KINDS[KIND]]), per_tenant: COUNT_ACCOUNTED_START_PER_TENANT .with_label_values(per_tenant_label_values), } }, count_accounted_finish: { GlobalAndPerTenantIntCounter { - global: COUNT_ACCOUNTED_FINISH.with_label_values(&[KIND]), + global: COUNT_ACCOUNTED_FINISH.with_label_values(&[KINDS[KIND]]), per_tenant: COUNT_ACCOUNTED_FINISH_PER_TENANT .with_label_values(per_tenant_label_values), } }, wait_time: { GlobalAndPerTenantIntCounter { - global: WAIT_USECS.with_label_values(&[KIND]), + global: WAIT_USECS.with_label_values(&[KINDS[KIND]]), per_tenant: WAIT_USECS_PER_TENANT .with_label_values(per_tenant_label_values), } }, count_throttled: { GlobalAndPerTenantIntCounter { - global: WAIT_COUNT.with_label_values(&[KIND]), + global: WAIT_COUNT.with_label_values(&[KINDS[KIND]]), per_tenant: WAIT_COUNT_PER_TENANT .with_label_values(per_tenant_label_values), } @@ -3447,15 +3468,17 @@ pub(crate) mod tenant_throttling { &WAIT_USECS_PER_TENANT, &WAIT_COUNT_PER_TENANT, ] { - let _ = m.remove_label_values(&[ - KIND, - &tenant_shard_id.tenant_id.to_string(), - &tenant_shard_id.shard_slug().to_string(), - ]); + for kind in KINDS { + let _ = m.remove_label_values(&[ + kind, + &tenant_shard_id.tenant_id.to_string(), + &tenant_shard_id.shard_slug().to_string(), + ]); + } } } - impl Metric for TimelineGet { + impl tenant::throttle::Metric for Metrics { #[inline(always)] fn accounting_start(&self) { self.count_accounted_start.inc(); diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 64842aa5b886..7026df952751 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -574,6 +574,41 @@ enum BatchedFeMessage { }, } +impl BatchedFeMessage { + async fn throttle(&mut self, cancel: &CancellationToken) -> Result<(), QueryError> { + let (shard, tokens, timers) = match self { + BatchedFeMessage::Exists { shard, timer, .. } + | BatchedFeMessage::Nblocks { shard, timer, .. } + | BatchedFeMessage::DbSize { shard, timer, .. } + | BatchedFeMessage::GetSlruSegment { shard, timer, .. } => { + ( + shard, + // 1 token is probably under-estimating because these + // request handlers typically do several Timeline::get calls. + 1, + itertools::Either::Left(std::iter::once(timer)), + ) + } + BatchedFeMessage::GetPage { shard, pages, .. } => ( + shard, + pages.len(), + itertools::Either::Right(pages.iter_mut().map(|(_, _, timer)| timer)), + ), + BatchedFeMessage::RespondError { .. } => return Ok(()), + }; + let throttled = tokio::select! 
{ + throttled = shard.pagestream_throttle.throttle(tokens) => { throttled } + _ = cancel.cancelled() => { + return Err(QueryError::Shutdown); + } + }; + for timer in timers { + timer.deduct_throttle(&throttled); + } + Ok(()) + } +} + impl PageServerHandler { pub fn new( tenant_manager: Arc, @@ -1157,13 +1192,18 @@ impl PageServerHandler { Ok(msg) => msg, Err(e) => break e, }; - let msg = match msg { + let mut msg = match msg { Some(msg) => msg, None => { debug!("pagestream subprotocol end observed"); return ((pgb_reader, timeline_handles), Ok(())); } }; + + if let Err(cancelled) = msg.throttle(&self.cancel).await { + break cancelled; + } + let err = self .pagesteam_handle_batched_message(pgb_writer, msg, &cancel, ctx) .await; @@ -1321,12 +1361,13 @@ impl PageServerHandler { return Ok(()); } }; - let batch = match batch { + let mut batch = match batch { Ok(batch) => batch, Err(e) => { return Err(e); } }; + batch.throttle(&self.cancel).await?; self.pagesteam_handle_batched_message(pgb_writer, batch, &cancel, &ctx) .await?; } diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index cd0690bb1a57..ada5c4a97705 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -357,8 +357,8 @@ pub struct Tenant { /// Throttle applied at the top of [`Timeline::get`]. /// All [`Tenant::timelines`] of a given [`Tenant`] instance share the same [`throttle::Throttle`] instance. - pub(crate) timeline_get_throttle: - Arc>, + pub(crate) pagestream_throttle: + Arc>, /// An ongoing timeline detach concurrency limiter. /// @@ -1678,7 +1678,7 @@ impl Tenant { remote_metadata, TimelineResources { remote_client, - timeline_get_throttle: self.timeline_get_throttle.clone(), + pagestream_throttle: self.pagestream_throttle.clone(), l0_flush_global_state: self.l0_flush_global_state.clone(), }, LoadTimelineCause::Attach, @@ -3835,7 +3835,7 @@ impl Tenant { } } - fn get_timeline_get_throttle_config( + fn get_pagestream_throttle_config( psconf: &'static PageServerConf, overrides: &TenantConfOpt, ) -> throttle::Config { @@ -3846,8 +3846,8 @@ impl Tenant { } pub(crate) fn tenant_conf_updated(&self, new_conf: &TenantConfOpt) { - let conf = Self::get_timeline_get_throttle_config(self.conf, new_conf); - self.timeline_get_throttle.reconfigure(conf) + let conf = Self::get_pagestream_throttle_config(self.conf, new_conf); + self.pagestream_throttle.reconfigure(conf) } /// Helper function to create a new Timeline struct. 
@@ -4009,9 +4009,9 @@ impl Tenant { attach_wal_lag_cooldown: Arc::new(std::sync::OnceLock::new()), cancel: CancellationToken::default(), gate: Gate::default(), - timeline_get_throttle: Arc::new(throttle::Throttle::new( - Tenant::get_timeline_get_throttle_config(conf, &attached_conf.tenant_conf), - crate::metrics::tenant_throttling::TimelineGet::new(&tenant_shard_id), + pagestream_throttle: Arc::new(throttle::Throttle::new( + Tenant::get_pagestream_throttle_config(conf, &attached_conf.tenant_conf), + crate::metrics::tenant_throttling::Metrics::new(&tenant_shard_id), )), tenant_conf: Arc::new(ArcSwap::from_pointee(attached_conf)), ongoing_timeline_detach: std::sync::Mutex::default(), @@ -4909,7 +4909,7 @@ impl Tenant { fn build_timeline_resources(&self, timeline_id: TimelineId) -> TimelineResources { TimelineResources { remote_client: self.build_timeline_remote_client(timeline_id), - timeline_get_throttle: self.timeline_get_throttle.clone(), + pagestream_throttle: self.pagestream_throttle.clone(), l0_flush_global_state: self.l0_flush_global_state.clone(), } } diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs index 16dac10dca22..0118a5ce5f9d 100644 --- a/pageserver/src/tenant/tasks.rs +++ b/pageserver/src/tenant/tasks.rs @@ -471,14 +471,14 @@ async fn ingest_housekeeping_loop(tenant: Arc, cancel: CancellationToken // TODO: rename the background loop kind to something more generic, like, tenant housekeeping. // Or just spawn another background loop for this throttle, it's not like it's super costly. - info_span!(parent: None, "timeline_get_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| { + info_span!(parent: None, "pagestream_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| { let now = Instant::now(); let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now); - let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.timeline_get_throttle.reset_stats(); + let Stats { count_accounted_start, count_accounted_finish, count_throttled, sum_throttled_usecs} = tenant.pagestream_throttle.reset_stats(); if count_throttled == 0 { return; } - let allowed_rps = tenant.timeline_get_throttle.steady_rps(); + let allowed_rps = tenant.pagestream_throttle.steady_rps(); let delta = now - prev; info!( n_seconds=%format_args!("{:.3}", delta.as_secs_f64()), diff --git a/pageserver/src/tenant/throttle.rs b/pageserver/src/tenant/throttle.rs index 7c4de55a476e..54c0e59daaf6 100644 --- a/pageserver/src/tenant/throttle.rs +++ b/pageserver/src/tenant/throttle.rs @@ -1,5 +1,4 @@ use std::{ - str::FromStr, sync::{ atomic::{AtomicU64, Ordering}, Arc, @@ -8,12 +7,8 @@ use std::{ }; use arc_swap::ArcSwap; -use enumset::EnumSet; -use tracing::error; use utils::leaky_bucket::{LeakyBucketConfig, RateLimiter}; -use crate::{context::RequestContext, task_mgr::TaskKind}; - /// Throttle for `async` functions. /// /// Runtime reconfigurable. 
@@ -35,7 +30,7 @@ pub struct Throttle { } pub struct Inner { - task_kinds: EnumSet, + enabled: bool, rate_limiter: Arc, } @@ -79,26 +74,12 @@ where } fn new_inner(config: Config) -> Inner { let Config { - task_kinds, + enabled, initial, refill_interval, refill_amount, max, } = config; - let task_kinds: EnumSet = task_kinds - .iter() - .filter_map(|s| match TaskKind::from_str(s) { - Ok(v) => Some(v), - Err(e) => { - // TODO: avoid this failure mode - error!( - "cannot parse task kind, ignoring for rate limiting {}", - utils::error::report_compact_sources(&e) - ); - None - } - }) - .collect(); // steady rate, we expect `refill_amount` requests per `refill_interval`. // dividing gives us the rps. @@ -112,7 +93,7 @@ where let rate_limiter = RateLimiter::with_initial_tokens(config, f64::from(initial_tokens)); Inner { - task_kinds, + enabled: enabled.is_enabled(), rate_limiter: Arc::new(rate_limiter), } } @@ -141,11 +122,13 @@ where self.inner.load().rate_limiter.steady_rps() } - pub async fn throttle(&self, ctx: &RequestContext, key_count: usize) -> Option { + pub async fn throttle(&self, key_count: usize) -> Option { let inner = self.inner.load_full(); // clones the `Inner` Arc - if !inner.task_kinds.contains(ctx.task_kind()) { + + if !inner.enabled { return None; - }; + } + let start = std::time::Instant::now(); self.metric.accounting_start(); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index dc3f823f20c8..1414bef0a5a2 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -208,8 +208,8 @@ fn drop_wlock(rlock: tokio::sync::RwLockWriteGuard<'_, T>) { /// The outward-facing resources required to build a Timeline pub struct TimelineResources { pub remote_client: RemoteTimelineClient, - pub timeline_get_throttle: - Arc>, + pub pagestream_throttle: + Arc>, pub l0_flush_global_state: l0_flush::L0FlushGlobalState, } @@ -411,9 +411,9 @@ pub struct Timeline { /// Timeline deletion will acquire both compaction and gc locks in whatever order. gc_lock: tokio::sync::Mutex<()>, - /// Cloned from [`super::Tenant::timeline_get_throttle`] on construction. - timeline_get_throttle: - Arc>, + /// Cloned from [`super::Tenant::pagestream_throttle`] on construction. + pub(crate) pagestream_throttle: + Arc>, /// Size estimator for aux file v2 pub(crate) aux_file_size_estimator: AuxFileSizeEstimator, @@ -949,7 +949,7 @@ impl Timeline { /// If a remote layer file is needed, it is downloaded as part of this /// call. /// - /// This method enforces [`Self::timeline_get_throttle`] internally. + /// This method enforces [`Self::pagestream_throttle`] internally. /// /// NOTE: It is considered an error to 'get' a key that doesn't exist. The /// abstraction above this needs to store suitable metadata to track what @@ -977,8 +977,6 @@ impl Timeline { // page_service. debug_assert!(!self.shard_identity.is_key_disposable(&key)); - self.timeline_get_throttle.throttle(ctx, 1).await; - let keyspace = KeySpace { ranges: vec![key..key.next()], }; @@ -1058,14 +1056,6 @@ impl Timeline { .for_task_kind(ctx.task_kind()) .map(|metric| (metric, Instant::now())); - // start counting after throttle so that throttle time - // is always less than observation time and we don't - // underflow when computing `ex_throttled` below. 
- let throttled = self - .timeline_get_throttle - .throttle(ctx, key_count as usize) - .await; - let res = self .get_vectored_impl( keyspace.clone(), @@ -1077,23 +1067,7 @@ impl Timeline { if let Some((metric, start)) = start { let elapsed = start.elapsed(); - let ex_throttled = if let Some(throttled) = throttled { - elapsed.checked_sub(throttled) - } else { - Some(elapsed) - }; - - if let Some(ex_throttled) = ex_throttled { - metric.observe(ex_throttled.as_secs_f64()); - } else { - use utils::rate_limit::RateLimit; - static LOGGED: Lazy> = - Lazy::new(|| Mutex::new(RateLimit::new(Duration::from_secs(10)))); - let mut rate_limit = LOGGED.lock().unwrap(); - rate_limit.call(|| { - warn!("error deducting time spent throttled; this message is logged at a global rate limit"); - }); - } + metric.observe(elapsed.as_secs_f64()); } res @@ -1138,16 +1112,6 @@ impl Timeline { .for_task_kind(ctx.task_kind()) .map(ScanLatencyOngoingRecording::start_recording); - // start counting after throttle so that throttle time - // is always less than observation time and we don't - // underflow when computing the `ex_throttled` value in - // `recording.observe(throttled)` below. - let throttled = self - .timeline_get_throttle - // assume scan = 1 quota for now until we find a better way to process this - .throttle(ctx, 1) - .await; - let vectored_res = self .get_vectored_impl( keyspace.clone(), @@ -1158,7 +1122,7 @@ impl Timeline { .await; if let Some(recording) = start { - recording.observe(throttled); + recording.observe(); } vectored_res @@ -2374,7 +2338,7 @@ impl Timeline { standby_horizon: AtomicLsn::new(0), - timeline_get_throttle: resources.timeline_get_throttle, + pagestream_throttle: resources.pagestream_throttle, aux_file_size_estimator: AuxFileSizeEstimator::new(aux_file_metrics), diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 67fc710c44ee..47a93b19d270 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -298,7 +298,7 @@ impl DeleteTimelineFlow { None, // Ancestor is not needed for deletion. TimelineResources { remote_client, - timeline_get_throttle: tenant.timeline_get_throttle.clone(), + pagestream_throttle: tenant.pagestream_throttle.clone(), l0_flush_global_state: tenant.l0_flush_global_state.clone(), }, // Important. We dont pass ancestor above because it can be missing. 
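The `SmgrOpTimer` accounting introduced above boils down to a simple rule: report wall-clock latency minus the time the request spent throttled, and fall back to the raw elapsed time (with a warning) if the subtraction would underflow. A standalone sketch of that rule, using plain std types instead of the pageserver's metric and rate-limit types:

```rust
use std::time::{Duration, Instant};

// Standalone sketch of the deduction rule used by SmgrOpTimer::drop above:
// observe wall-clock time minus throttled time, falling back to the raw
// elapsed time if the subtraction would underflow (an accounting bug).
// The real code additionally rate-limits the warning per SmgrQueryType.
fn observed_latency(start: Instant, throttled: Duration) -> Duration {
    let elapsed = start.elapsed();
    match elapsed.checked_sub(throttled) {
        Some(ex_throttled) => ex_throttled,
        None => {
            eprintln!(
                "implementation error: throttled {throttled:?} exceeds elapsed {elapsed:?}"
            );
            elapsed // still more informative than saturating to zero
        }
    }
}

fn main() {
    let start = Instant::now();
    std::thread::sleep(Duration::from_millis(20));
    // Pretend 5ms of the 20ms were spent waiting on the pagestream throttle.
    let latency = observed_latency(start, Duration::from_millis(5));
    println!("recorded latency: {latency:?}");
}
```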
diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py index 62aec50a9ed3..6d0661f068ca 100644 --- a/test_runner/regress/test_pageserver_getpage_throttle.py +++ b/test_runner/regress/test_pageserver_getpage_throttle.py @@ -33,7 +33,9 @@ def test_pageserver_getpage_throttle(neon_env_builder: NeonEnvBuilder, pg_bin: P conf={ "compaction_period": f"{compaction_period}s", "timeline_get_throttle": { - "task_kinds": ["PageRequestHandler"], + "task_kinds": [ + "PageRequestHandler" + ], # any non-empty array will do here https://github.com/neondatabase/neon/pull/9962 "initial": 0, "refill_interval": "100ms", "refill_amount": int(rate_limit_rps / 10), @@ -116,7 +118,6 @@ def run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs: i timeout=compaction_period, ) - log.info("the smgr metric includes throttle time") smgr_query_seconds_post = ps_http.get_metric_value(smgr_metric_name, smgr_metrics_query) assert smgr_query_seconds_post is not None throttled_usecs_post = ps_http.get_metric_value(throttle_metric_name, throttle_metrics_query) @@ -125,13 +126,14 @@ def run_pagebench_at_max_speed_and_get_total_requests_completed(duration_secs: i actual_throttled_usecs = throttled_usecs_post - throttled_usecs_pre actual_throttled_secs = actual_throttled_usecs / 1_000_000 + log.info("validate that the metric doesn't include throttle wait time") assert ( - pytest.approx(duration_secs, 0.1) == actual_smgr_query_seconds - ), "smgr metrics include throttle wait time" - smgr_ex_throttle = actual_smgr_query_seconds - actual_throttled_secs - assert smgr_ex_throttle > 0 + duration_secs >= 10 * actual_smgr_query_seconds + ), "smgr metrics should not include throttle wait time" + + log.info("validate that the throttling wait time metrics is correct") assert ( - duration_secs > 10 * smgr_ex_throttle + pytest.approx(actual_throttled_secs + actual_smgr_query_seconds, 0.1) == duration_secs ), "most of the time in this test is spent throttled because the rate-limit's contribution to latency dominates" From 71d004289c0e9b62a3be96939a8b5defa8f98065 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 3 Dec 2024 16:55:00 +0000 Subject: [PATCH 19/65] storcon: in shard splits, inherit parent's AZ (#9946) ## Problem Sharded tenants should be run in a single AZ for best performance, so that computes have AZ-local latency to all the shards. Part of https://github.com/neondatabase/neon/issues/8264 ## Summary of changes - When we split a tenant, instead of updating each shard's preferred AZ to wherever it is scheduled, propagate the preferred AZ from the parent. - Drop the check in `test_shard_preferred_azs` that asserts shards end up in their preferred AZ: this will not be true again until the optimize_attachment logic is updated to make this so. The existing check wasn't testing anything about scheduling, it was just asserting that we set preferred AZ in a way that matches the way things happen to be scheduled at time of split. 
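To make the new split behaviour concrete: children of a split inherit the parent's preferred AZ verbatim, rather than adopting the AZ of whichever pageserver they are first attached to. A minimal sketch of that rule (the `ShardState` type and the way children are produced are hypothetical stand-ins, not the storage controller's real types):

```rust
#[derive(Clone, Debug, PartialEq)]
struct AvailabilityZone(String);

// Hypothetical stand-in for the controller's per-shard state; only the
// field needed to illustrate AZ inheritance is included.
#[derive(Clone, Debug)]
struct ShardState {
    preferred_az: Option<AvailabilityZone>,
}

/// Children produced by a split copy the parent's preferred AZ, so that a
/// later AZ-aware optimizer can co-locate all shards with the compute.
fn split(parent: &ShardState, new_shard_count: usize) -> Vec<ShardState> {
    (0..new_shard_count)
        .map(|_| ShardState {
            preferred_az: parent.preferred_az.clone(),
        })
        .collect()
}

fn main() {
    let parent = ShardState {
        preferred_az: Some(AvailabilityZone("eu-west-1b".into())),
    };
    for child in split(&parent, 4) {
        assert_eq!(child.preferred_az, parent.preferred_az);
    }
    println!("all children inherit the parent's preferred AZ");
}
```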
--- storage_controller/src/service.rs | 68 ++++++------------- .../regress/test_storage_controller.py | 6 +- 2 files changed, 24 insertions(+), 50 deletions(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 52c9c4710d89..741d3dc2b4da 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -44,12 +44,12 @@ use futures::{stream::FuturesUnordered, StreamExt}; use itertools::Itertools; use pageserver_api::{ controller_api::{ - MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, NodeRegisterRequest, - NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, ShardSchedulingPolicy, - ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, TenantCreateRequest, - TenantCreateResponse, TenantCreateResponseShard, TenantDescribeResponse, - TenantDescribeResponseShard, TenantLocateResponse, TenantPolicyRequest, - TenantShardMigrateRequest, TenantShardMigrateResponse, + AvailabilityZone, MetadataHealthRecord, MetadataHealthUpdateRequest, NodeAvailability, + NodeRegisterRequest, NodeSchedulingPolicy, NodeShard, NodeShardResponse, PlacementPolicy, + ShardSchedulingPolicy, ShardsPreferredAzsRequest, ShardsPreferredAzsResponse, + TenantCreateRequest, TenantCreateResponse, TenantCreateResponseShard, + TenantDescribeResponse, TenantDescribeResponseShard, TenantLocateResponse, + TenantPolicyRequest, TenantShardMigrateRequest, TenantShardMigrateResponse, }, models::{ SecondaryProgress, TenantConfigRequest, TimelineArchivalConfigRequest, @@ -468,6 +468,7 @@ struct ShardSplitParams { policy: PlacementPolicy, config: TenantConfig, shard_ident: ShardIdentity, + preferred_az_id: Option, } // When preparing for a shard split, we may either choose to proceed with the split, @@ -4103,7 +4104,7 @@ impl Service { for parent_id in parent_ids { let child_ids = parent_id.split(new_shard_count); - let (pageserver, generation, policy, parent_ident, config) = { + let (pageserver, generation, policy, parent_ident, config, preferred_az) = { let mut old_state = tenants .remove(&parent_id) .expect("It was present, we just split it"); @@ -4122,6 +4123,7 @@ impl Service { old_state.policy.clone(), old_state.shard, old_state.config.clone(), + old_state.preferred_az().cloned(), ) }; @@ -4154,6 +4156,9 @@ impl Service { }; child_state.generation = Some(generation); child_state.config = config.clone(); + if let Some(preferred_az) = &preferred_az { + child_state.set_preferred_az(preferred_az.clone()); + } // The child's TenantShard::splitting is intentionally left at the default value of Idle, // as at this point in the split process we have succeeded and this part is infallible: @@ -4346,6 +4351,7 @@ impl Service { let mut policy = None; let mut config = None; let mut shard_ident = None; + let mut preferred_az_id = None; // Validate input, and calculate which shards we will create let (old_shard_count, targets) = { @@ -4404,6 +4410,9 @@ impl Service { if config.is_none() { config = Some(shard.config.clone()); } + if preferred_az_id.is_none() { + preferred_az_id = shard.preferred_az().cloned(); + } if tenant_shard_id.shard_count.count() == split_req.new_shard_count { tracing::info!( @@ -4474,6 +4483,7 @@ impl Service { policy, config, shard_ident, + preferred_az_id, }))) } @@ -4496,6 +4506,7 @@ impl Service { policy, config, shard_ident, + preferred_az_id, } = *params; // Drop any secondary locations: pageservers do not support splitting these, and in any case the @@ -4569,7 +4580,7 @@ impl Service { // Scheduling policies and 
preferred AZ do not carry through to children scheduling_policy: serde_json::to_string(&ShardSchedulingPolicy::default()) .unwrap(), - preferred_az_id: None, + preferred_az_id: preferred_az_id.as_ref().map(|az| az.0.clone()), }); } @@ -4689,47 +4700,6 @@ impl Service { let (response, child_locations, waiters) = self.tenant_shard_split_commit_inmem(tenant_id, new_shard_count, new_stripe_size); - // Now that we have scheduled the child shards, attempt to set their preferred AZ - // to that of the pageserver they've been attached on. - let preferred_azs = { - let locked = self.inner.read().unwrap(); - child_locations - .iter() - .filter_map(|(tid, node_id, _stripe_size)| { - let az_id = locked - .nodes - .get(node_id) - .map(|n| n.get_availability_zone_id().clone())?; - - Some((*tid, az_id)) - }) - .collect::>() - }; - - let updated = self - .persistence - .set_tenant_shard_preferred_azs(preferred_azs) - .await - .map_err(|err| { - ApiError::InternalServerError(anyhow::anyhow!( - "Failed to persist preferred az ids: {err}" - )) - }); - - match updated { - Ok(updated) => { - let mut locked = self.inner.write().unwrap(); - for (tid, az_id) in updated { - if let Some(shard) = locked.tenants.get_mut(&tid) { - shard.set_preferred_az(az_id); - } - } - } - Err(err) => { - tracing::warn!("Failed to persist preferred AZs after split: {err}"); - } - } - // Send compute notifications for all the new shards let mut failed_notifications = Vec::new(); for (child_id, child_ps, stripe_size) in child_locations { diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index 244893a616d4..f878116d533d 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -3057,7 +3057,11 @@ def assign_az(ps_cfg): for shard in shards: attached_to = shard["node_attached"] expected_az = env.get_pageserver(attached_to).az_id - assert shard["preferred_az_id"] == expected_az + + # The scheduling optimization logic is not yet AZ-aware, so doesn't succeed + # in putting the tenant shards in the preferred AZ. + # To be fixed in https://github.com/neondatabase/neon/pull/9916 + # assert shard["preferred_az_id"] == expected_az @run_only_on_default_postgres("Postgres version makes no difference here") From dcb629532b075a68ba6a2aeeb3933e8ac73efbb9 Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 3 Dec 2024 17:22:49 +0000 Subject: [PATCH 20/65] pageserver: only store SLRUs & aux files on shard zero (#9786) ## Problem Since https://github.com/neondatabase/neon/pull/9423 the non-zero shards no longer need SLRU content in order to do GC. This data is now redundant on shards >0. One release cycle after merging that PR, we may merge this one, which also stops writing those pages to shards > 0, reaping the efficiency benefit. 
Closes: https://github.com/neondatabase/neon/issues/7512 Closes: https://github.com/neondatabase/neon/issues/9641 ## Summary of changes - Avoid storing SLRUs on non-zero shards - Bonus: avoid storing aux files on non-zero shards --- libs/pageserver_api/src/key.rs | 5 ++ libs/pageserver_api/src/shard.rs | 34 ++++++++--- libs/wal_decoder/src/decoder.rs | 54 ++++++++++-------- pageserver/src/import_datadir.rs | 18 ++++-- pageserver/src/pgdatadir_mapping.rs | 57 ++++++++++++------- .../src/tenant/timeline/import_pgdata/flow.rs | 49 +++++++--------- pageserver/src/walingest.rs | 4 ++ 7 files changed, 135 insertions(+), 86 deletions(-) diff --git a/libs/pageserver_api/src/key.rs b/libs/pageserver_api/src/key.rs index 523d1433818b..37dff6fe4647 100644 --- a/libs/pageserver_api/src/key.rs +++ b/libs/pageserver_api/src/key.rs @@ -770,6 +770,11 @@ impl Key { && self.field6 == 1 } + #[inline(always)] + pub fn is_aux_file_key(&self) -> bool { + self.field1 == AUX_KEY_PREFIX + } + /// Guaranteed to return `Ok()` if [`Self::is_rel_block_key`] returns `true` for `key`. #[inline(always)] pub fn to_rel_block(self) -> anyhow::Result<(RelTag, BlockNumber)> { diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index e83cf4c855a1..a5c94a82c162 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -170,19 +170,37 @@ impl ShardIdentity { } } + /// Return true if the key should be stored on all shards, not just one. + fn is_key_global(&self, key: &Key) -> bool { + if key.is_slru_block_key() || key.is_slru_segment_size_key() || key.is_aux_file_key() { + // Special keys that are only stored on shard 0 + false + } else if key.is_rel_block_key() { + // Ordinary relation blocks are distributed across shards + false + } else if key.is_rel_size_key() { + // All shards maintain rel size keys (although only shard 0 is responsible for + // keeping it strictly accurate, other shards just reflect the highest block they've ingested) + true + } else { + // For everything else, we assume it must be kept everywhere, because ingest code + // might assume this -- this covers functionality where the ingest code has + // not (yet) been made fully shard aware. + true + } + } + /// Return true if the key should be discarded if found in this shard's /// data store, e.g. during compaction after a split. /// /// Shards _may_ drop keys which return false here, but are not obliged to. pub fn is_key_disposable(&self, key: &Key) -> bool { - if key_is_shard0(key) { - // Q: Why can't we dispose of shard0 content if we're not shard 0? - // A1: because the WAL ingestion logic currently ingests some shard 0 - // content on all shards, even though it's only read on shard 0. If we - // dropped it, then subsequent WAL ingest to these keys would encounter - // an error. - // A2: because key_is_shard0 also covers relation size keys, which are written - // on all shards even though they're only maintained accurately on shard 0. + if self.count < ShardCount(2) { + // Fast path: unsharded tenant doesn't dispose of anything + return false; + } + + if self.is_key_global(key) { false } else { !self.is_key_local(key) diff --git a/libs/wal_decoder/src/decoder.rs b/libs/wal_decoder/src/decoder.rs index 36c4b19266aa..aa50c629113b 100644 --- a/libs/wal_decoder/src/decoder.rs +++ b/libs/wal_decoder/src/decoder.rs @@ -112,30 +112,38 @@ impl MetadataRecord { }; // Next, filter the metadata record by shard. - - // Route VM page updates to the shards that own them. 
VM pages are stored in the VM fork - // of the main relation. These are sharded and managed just like regular relation pages. - // See: https://github.com/neondatabase/neon/issues/9855 - if let Some( - MetadataRecord::Heapam(HeapamRecord::ClearVmBits(ref mut clear_vm_bits)) - | MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(ref mut clear_vm_bits)), - ) = metadata_record - { - let is_local_vm_page = |heap_blk| { - let vm_blk = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blk); - shard.is_key_local(&rel_block_to_key(clear_vm_bits.vm_rel, vm_blk)) - }; - // Send the old and new VM page updates to their respective shards. - clear_vm_bits.old_heap_blkno = clear_vm_bits - .old_heap_blkno - .filter(|&blkno| is_local_vm_page(blkno)); - clear_vm_bits.new_heap_blkno = clear_vm_bits - .new_heap_blkno - .filter(|&blkno| is_local_vm_page(blkno)); - // If neither VM page belongs to this shard, discard the record. - if clear_vm_bits.old_heap_blkno.is_none() && clear_vm_bits.new_heap_blkno.is_none() { - metadata_record = None + match metadata_record { + Some( + MetadataRecord::Heapam(HeapamRecord::ClearVmBits(ref mut clear_vm_bits)) + | MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(ref mut clear_vm_bits)), + ) => { + // Route VM page updates to the shards that own them. VM pages are stored in the VM fork + // of the main relation. These are sharded and managed just like regular relation pages. + // See: https://github.com/neondatabase/neon/issues/9855 + let is_local_vm_page = |heap_blk| { + let vm_blk = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blk); + shard.is_key_local(&rel_block_to_key(clear_vm_bits.vm_rel, vm_blk)) + }; + // Send the old and new VM page updates to their respective shards. + clear_vm_bits.old_heap_blkno = clear_vm_bits + .old_heap_blkno + .filter(|&blkno| is_local_vm_page(blkno)); + clear_vm_bits.new_heap_blkno = clear_vm_bits + .new_heap_blkno + .filter(|&blkno| is_local_vm_page(blkno)); + // If neither VM page belongs to this shard, discard the record. 
+ if clear_vm_bits.old_heap_blkno.is_none() && clear_vm_bits.new_heap_blkno.is_none() + { + metadata_record = None + } } + Some(MetadataRecord::LogicalMessage(LogicalMessageRecord::Put(_))) => { + // Filter LogicalMessage records (AUX files) to only be stored on shard zero + if !shard.is_shard_zero() { + metadata_record = None; + } + } + _ => {} } Ok(metadata_record) diff --git a/pageserver/src/import_datadir.rs b/pageserver/src/import_datadir.rs index 06c4553e1c5b..c061714010a2 100644 --- a/pageserver/src/import_datadir.rs +++ b/pageserver/src/import_datadir.rs @@ -575,18 +575,24 @@ async fn import_file( } else if file_path.starts_with("pg_xact") { let slru = SlruKind::Clog; - import_slru(modification, slru, file_path, reader, len, ctx).await?; - debug!("imported clog slru"); + if modification.tline.tenant_shard_id.is_shard_zero() { + import_slru(modification, slru, file_path, reader, len, ctx).await?; + debug!("imported clog slru"); + } } else if file_path.starts_with("pg_multixact/offsets") { let slru = SlruKind::MultiXactOffsets; - import_slru(modification, slru, file_path, reader, len, ctx).await?; - debug!("imported multixact offsets slru"); + if modification.tline.tenant_shard_id.is_shard_zero() { + import_slru(modification, slru, file_path, reader, len, ctx).await?; + debug!("imported multixact offsets slru"); + } } else if file_path.starts_with("pg_multixact/members") { let slru = SlruKind::MultiXactMembers; - import_slru(modification, slru, file_path, reader, len, ctx).await?; - debug!("imported multixact members slru"); + if modification.tline.tenant_shard_id.is_shard_zero() { + import_slru(modification, slru, file_path, reader, len, ctx).await?; + debug!("imported multixact members slru"); + } } else if file_path.starts_with("pg_twophase") { let bytes = read_all_bytes(reader).await?; diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs index a00ec761e25c..255bd01e259e 100644 --- a/pageserver/src/pgdatadir_mapping.rs +++ b/pageserver/src/pgdatadir_mapping.rs @@ -530,6 +530,7 @@ impl Timeline { lsn: Lsn, ctx: &RequestContext, ) -> Result { + assert!(self.tenant_shard_id.is_shard_zero()); let n_blocks = self .get_slru_segment_size(kind, segno, Version::Lsn(lsn), ctx) .await?; @@ -552,6 +553,7 @@ impl Timeline { lsn: Lsn, ctx: &RequestContext, ) -> Result { + assert!(self.tenant_shard_id.is_shard_zero()); let key = slru_block_to_key(kind, segno, blknum); self.get(key, lsn, ctx).await } @@ -564,6 +566,7 @@ impl Timeline { version: Version<'_>, ctx: &RequestContext, ) -> Result { + assert!(self.tenant_shard_id.is_shard_zero()); let key = slru_segment_size_to_key(kind, segno); let mut buf = version.get(self, key, ctx).await?; Ok(buf.get_u32_le()) @@ -577,6 +580,7 @@ impl Timeline { version: Version<'_>, ctx: &RequestContext, ) -> Result { + assert!(self.tenant_shard_id.is_shard_zero()); // fetch directory listing let key = slru_dir_to_key(kind); let buf = version.get(self, key, ctx).await?; @@ -1047,26 +1051,28 @@ impl Timeline { } // Iterate SLRUs next - for kind in [ - SlruKind::Clog, - SlruKind::MultiXactMembers, - SlruKind::MultiXactOffsets, - ] { - let slrudir_key = slru_dir_to_key(kind); - result.add_key(slrudir_key); - let buf = self.get(slrudir_key, lsn, ctx).await?; - let dir = SlruSegmentDirectory::des(&buf)?; - let mut segments: Vec = dir.segments.iter().cloned().collect(); - segments.sort_unstable(); - for segno in segments { - let segsize_key = slru_segment_size_to_key(kind, segno); - let mut buf = self.get(segsize_key, lsn, ctx).await?; 
- let segsize = buf.get_u32_le(); - - result.add_range( - slru_block_to_key(kind, segno, 0)..slru_block_to_key(kind, segno, segsize), - ); - result.add_key(segsize_key); + if self.tenant_shard_id.is_shard_zero() { + for kind in [ + SlruKind::Clog, + SlruKind::MultiXactMembers, + SlruKind::MultiXactOffsets, + ] { + let slrudir_key = slru_dir_to_key(kind); + result.add_key(slrudir_key); + let buf = self.get(slrudir_key, lsn, ctx).await?; + let dir = SlruSegmentDirectory::des(&buf)?; + let mut segments: Vec = dir.segments.iter().cloned().collect(); + segments.sort_unstable(); + for segno in segments { + let segsize_key = slru_segment_size_to_key(kind, segno); + let mut buf = self.get(segsize_key, lsn, ctx).await?; + let segsize = buf.get_u32_le(); + + result.add_range( + slru_block_to_key(kind, segno, 0)..slru_block_to_key(kind, segno, segsize), + ); + result.add_key(segsize_key); + } } } @@ -1468,6 +1474,10 @@ impl<'a> DatadirModification<'a> { blknum: BlockNumber, rec: NeonWalRecord, ) -> anyhow::Result<()> { + if !self.tline.tenant_shard_id.is_shard_zero() { + return Ok(()); + } + self.put( slru_block_to_key(kind, segno, blknum), Value::WalRecord(rec), @@ -1501,6 +1511,8 @@ impl<'a> DatadirModification<'a> { blknum: BlockNumber, img: Bytes, ) -> anyhow::Result<()> { + assert!(self.tline.tenant_shard_id.is_shard_zero()); + let key = slru_block_to_key(kind, segno, blknum); if !key.is_valid_key_on_write_path() { anyhow::bail!( @@ -1542,6 +1554,7 @@ impl<'a> DatadirModification<'a> { segno: u32, blknum: BlockNumber, ) -> anyhow::Result<()> { + assert!(self.tline.tenant_shard_id.is_shard_zero()); let key = slru_block_to_key(kind, segno, blknum); if !key.is_valid_key_on_write_path() { anyhow::bail!( @@ -1853,6 +1866,8 @@ impl<'a> DatadirModification<'a> { nblocks: BlockNumber, ctx: &RequestContext, ) -> anyhow::Result<()> { + assert!(self.tline.tenant_shard_id.is_shard_zero()); + // Add it to the directory entry let dir_key = slru_dir_to_key(kind); let buf = self.get(dir_key, ctx).await?; @@ -1885,6 +1900,8 @@ impl<'a> DatadirModification<'a> { segno: u32, nblocks: BlockNumber, ) -> anyhow::Result<()> { + assert!(self.tline.tenant_shard_id.is_shard_zero()); + // Put size let size_key = slru_segment_size_to_key(kind, segno); let buf = nblocks.to_le_bytes(); diff --git a/pageserver/src/tenant/timeline/import_pgdata/flow.rs b/pageserver/src/tenant/timeline/import_pgdata/flow.rs index cbd4168c06e5..43880726063a 100644 --- a/pageserver/src/tenant/timeline/import_pgdata/flow.rs +++ b/pageserver/src/tenant/timeline/import_pgdata/flow.rs @@ -129,22 +129,23 @@ impl Flow { } // Import SLRUs - - // pg_xact (01:00 keyspace) - self.import_slru(SlruKind::Clog, &self.storage.pgdata().join("pg_xact")) + if self.timeline.tenant_shard_id.is_shard_zero() { + // pg_xact (01:00 keyspace) + self.import_slru(SlruKind::Clog, &self.storage.pgdata().join("pg_xact")) + .await?; + // pg_multixact/members (01:01 keyspace) + self.import_slru( + SlruKind::MultiXactMembers, + &self.storage.pgdata().join("pg_multixact/members"), + ) .await?; - // pg_multixact/members (01:01 keyspace) - self.import_slru( - SlruKind::MultiXactMembers, - &self.storage.pgdata().join("pg_multixact/members"), - ) - .await?; - // pg_multixact/offsets (01:02 keyspace) - self.import_slru( - SlruKind::MultiXactOffsets, - &self.storage.pgdata().join("pg_multixact/offsets"), - ) - .await?; + // pg_multixact/offsets (01:02 keyspace) + self.import_slru( + SlruKind::MultiXactOffsets, + &self.storage.pgdata().join("pg_multixact/offsets"), + ) + .await?; + } // 
Import pg_twophase. // TODO: as empty @@ -302,6 +303,8 @@ impl Flow { } async fn import_slru(&mut self, kind: SlruKind, path: &RemotePath) -> anyhow::Result<()> { + assert!(self.timeline.tenant_shard_id.is_shard_zero()); + let segments = self.storage.listfilesindir(path).await?; let segments: Vec<(String, u32, usize)> = segments .into_iter() @@ -337,7 +340,6 @@ impl Flow { debug!(%p, segno=%segno, %size, %start_key, %end_key, "scheduling SLRU segment"); self.tasks .push(AnyImportTask::SlruBlocks(ImportSlruBlocksTask::new( - *self.timeline.get_shard_identity(), start_key..end_key, &p, self.storage.clone(), @@ -631,21 +633,14 @@ impl ImportTask for ImportRelBlocksTask { } struct ImportSlruBlocksTask { - shard_identity: ShardIdentity, key_range: Range, path: RemotePath, storage: RemoteStorageWrapper, } impl ImportSlruBlocksTask { - fn new( - shard_identity: ShardIdentity, - key_range: Range, - path: &RemotePath, - storage: RemoteStorageWrapper, - ) -> Self { + fn new(key_range: Range, path: &RemotePath, storage: RemoteStorageWrapper) -> Self { ImportSlruBlocksTask { - shard_identity, key_range, path: path.clone(), storage, @@ -673,17 +668,13 @@ impl ImportTask for ImportSlruBlocksTask { let mut file_offset = 0; while blknum < end_blk { let key = slru_block_to_key(kind, segno, blknum); - assert!( - !self.shard_identity.is_key_disposable(&key), - "SLRU keys need to go into every shard" - ); let buf = &buf[file_offset..(file_offset + 8192)]; file_offset += 8192; layer_writer .put_image(key, Bytes::copy_from_slice(buf), ctx) .await?; - blknum += 1; nimages += 1; + blknum += 1; } Ok(nimages) } diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index d568da596ab7..93ae88936f60 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -1392,6 +1392,10 @@ impl WalIngest { img: Bytes, ctx: &RequestContext, ) -> Result<()> { + if !self.shard.is_shard_zero() { + return Ok(()); + } + self.handle_slru_extend(modification, kind, segno, blknum, ctx) .await?; modification.put_slru_page_image(kind, segno, blknum, img)?; From b04ab468ee830676fe431975a89b1ce3ec781bac Mon Sep 17 00:00:00 2001 From: John Spray Date: Tue, 3 Dec 2024 18:36:37 +0000 Subject: [PATCH 21/65] pageserver: more detailed logs when calling re-attach (#9996) ## Problem We saw a peculiar case where a pageserver apparently got a 0-tenant response to `/re-attach` but we couldn't see the request landing on a storage controller. It was hard to confirm retrospectively that the pageserver was configured properly at the moment it sent the request. 
## Summary of changes - Log the URL to which we are sending the request - Log the NodeId and metadata that we sent --- libs/pageserver_api/src/controller_api.rs | 4 ++-- pageserver/src/controller_upcall_client.rs | 12 +++++++++--- pageserver/src/tenant/mgr.rs | 2 +- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs index 0ea30ce54f78..9a5ebc95bdd3 100644 --- a/libs/pageserver_api/src/controller_api.rs +++ b/libs/pageserver_api/src/controller_api.rs @@ -48,7 +48,7 @@ pub struct TenantCreateResponse { pub shards: Vec, } -#[derive(Serialize, Deserialize)] +#[derive(Serialize, Deserialize, Debug, Clone)] pub struct NodeRegisterRequest { pub node_id: NodeId, @@ -75,7 +75,7 @@ pub struct TenantPolicyRequest { pub scheduling: Option, } -#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Hash)] +#[derive(Clone, Serialize, Deserialize, PartialEq, Eq, Hash, Debug)] pub struct AvailabilityZone(pub String); impl Display for AvailabilityZone { diff --git a/pageserver/src/controller_upcall_client.rs b/pageserver/src/controller_upcall_client.rs index 73fc6dc3ab1f..d41bfd9021c5 100644 --- a/pageserver/src/controller_upcall_client.rs +++ b/pageserver/src/controller_upcall_client.rs @@ -115,6 +115,10 @@ impl ControllerUpcallClient { Ok(res) } + + pub(crate) fn base_url(&self) -> &Url { + &self.base_url + } } impl ControlPlaneGenerationsApi for ControllerUpcallClient { @@ -191,13 +195,15 @@ impl ControlPlaneGenerationsApi for ControllerUpcallClient { let request = ReAttachRequest { node_id: self.node_id, - register, + register: register.clone(), }; let response: ReAttachResponse = self.retry_http_forever(&re_attach_path, request).await?; tracing::info!( - "Received re-attach response with {} tenants", - response.tenants.len() + "Received re-attach response with {} tenants (node {}, register: {:?})", + response.tenants.len(), + self.node_id, + register, ); failpoint_support::sleep_millis_async!("control-plane-client-re-attach"); diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index eb8191e43e1c..45481c4ed44e 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -347,7 +347,7 @@ async fn init_load_generations( ); emergency_generations(tenant_confs) } else if let Some(client) = ControllerUpcallClient::new(conf, cancel) { - info!("Calling control plane API to re-attach tenants"); + info!("Calling {} API to re-attach tenants", client.base_url()); // If we are configured to use the control plane API, then it is the source of truth for what tenants to load. match client.re_attach(conf).await { Ok(tenants) => tenants From 27a42d0f960c29b505b972841e0d79c1eab138fb Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 3 Dec 2024 18:39:23 +0000 Subject: [PATCH 22/65] chore(proxy): remove postgres config parser and md5 support (#9990) Keeping the `mock` postgres cplane adaptor using "stock" tokio-postgres allows us to remove a lot of dead weight from our actual postgres connection logic. 
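One practical consequence of dropping the connection-string parser from the vendored crate is that callers assemble a `Config` programmatically. A rough sketch under that assumption; the builder-method names follow the calls visible in the removed parser below, and the host and role are placeholders, not real endpoints:

```rust
// Rough sketch: with FromStr/URL parsing gone from the vendored crate,
// a connection Config is built via its setter methods instead.
use postgres_client::config::{Config, SslMode};

fn compute_config() -> Config {
    let mut config = Config::new();
    config.host("compute.example.internal"); // placeholder host
    config.port(5432);
    config.user("cloud_admin"); // placeholder role
    config.dbname("postgres");
    config.ssl_mode(SslMode::Require);
    config
}

fn main() {
    // Debug output intentionally omits the password field.
    println!("{:?}", compute_config());
}
```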
--- Cargo.lock | 2 +- libs/proxy/postgres-protocol2/Cargo.toml | 1 - .../src/authentication/mod.rs | 35 -- .../postgres-protocol2/src/message/backend.rs | 8 +- .../postgres-protocol2/src/password/mod.rs | 18 - .../postgres-protocol2/src/password/test.rs | 8 - libs/proxy/tokio-postgres2/src/config.rs | 465 +----------------- libs/proxy/tokio-postgres2/src/connect_raw.rs | 19 +- libs/proxy/tokio-postgres2/src/error/mod.rs | 6 - libs/proxy/tokio-postgres2/src/lib.rs | 20 - proxy/Cargo.toml | 6 +- proxy/src/auth/backend/classic.rs | 2 +- proxy/src/auth/backend/console_redirect.rs | 2 +- proxy/src/auth/backend/mod.rs | 2 +- proxy/src/auth/flow.rs | 2 +- proxy/src/cancellation.rs | 6 +- proxy/src/compute.rs | 26 +- proxy/src/control_plane/client/mock.rs | 3 +- proxy/src/control_plane/client/neon.rs | 2 +- proxy/src/error.rs | 2 +- proxy/src/postgres_rustls/mod.rs | 6 +- proxy/src/proxy/retry.rs | 16 +- proxy/src/proxy/tests/mitm.rs | 22 +- proxy/src/proxy/tests/mod.rs | 20 +- proxy/src/serverless/backend.rs | 18 +- proxy/src/serverless/conn_pool.rs | 6 +- proxy/src/serverless/conn_pool_lib.rs | 4 +- proxy/src/serverless/json.rs | 6 +- proxy/src/serverless/local_conn_pool.rs | 10 +- proxy/src/serverless/sql_over_http.rs | 18 +- 30 files changed, 96 insertions(+), 665 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b2769e59f082..5b80ec5e93d8 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4209,7 +4209,6 @@ dependencies = [ "bytes", "fallible-iterator", "hmac", - "md-5", "memchr", "rand 0.8.5", "sha2", @@ -4612,6 +4611,7 @@ dependencies = [ "tikv-jemalloc-ctl", "tikv-jemallocator", "tokio", + "tokio-postgres", "tokio-postgres2", "tokio-rustls 0.26.0", "tokio-tungstenite", diff --git a/libs/proxy/postgres-protocol2/Cargo.toml b/libs/proxy/postgres-protocol2/Cargo.toml index 284a632954fd..f71c1599c7c2 100644 --- a/libs/proxy/postgres-protocol2/Cargo.toml +++ b/libs/proxy/postgres-protocol2/Cargo.toml @@ -10,7 +10,6 @@ byteorder.workspace = true bytes.workspace = true fallible-iterator.workspace = true hmac.workspace = true -md-5 = "0.10" memchr = "2.0" rand.workspace = true sha2.workspace = true diff --git a/libs/proxy/postgres-protocol2/src/authentication/mod.rs b/libs/proxy/postgres-protocol2/src/authentication/mod.rs index 71afa4b9b60a..0bdc177143fb 100644 --- a/libs/proxy/postgres-protocol2/src/authentication/mod.rs +++ b/libs/proxy/postgres-protocol2/src/authentication/mod.rs @@ -1,37 +1,2 @@ //! Authentication protocol support. -use md5::{Digest, Md5}; - pub mod sasl; - -/// Hashes authentication information in a way suitable for use in response -/// to an `AuthenticationMd5Password` message. -/// -/// The resulting string should be sent back to the database in a -/// `PasswordMessage` message. 
-#[inline] -pub fn md5_hash(username: &[u8], password: &[u8], salt: [u8; 4]) -> String { - let mut md5 = Md5::new(); - md5.update(password); - md5.update(username); - let output = md5.finalize_reset(); - md5.update(format!("{:x}", output)); - md5.update(salt); - format!("md5{:x}", md5.finalize()) -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn md5() { - let username = b"md5_user"; - let password = b"password"; - let salt = [0x2a, 0x3d, 0x8f, 0xe0]; - - assert_eq!( - md5_hash(username, password, salt), - "md562af4dd09bbb41884907a838a3233294" - ); - } -} diff --git a/libs/proxy/postgres-protocol2/src/message/backend.rs b/libs/proxy/postgres-protocol2/src/message/backend.rs index 33d77fc25261..097964f9c110 100644 --- a/libs/proxy/postgres-protocol2/src/message/backend.rs +++ b/libs/proxy/postgres-protocol2/src/message/backend.rs @@ -79,7 +79,7 @@ pub enum Message { AuthenticationCleartextPassword, AuthenticationGss, AuthenticationKerberosV5, - AuthenticationMd5Password(AuthenticationMd5PasswordBody), + AuthenticationMd5Password, AuthenticationOk, AuthenticationScmCredential, AuthenticationSspi, @@ -191,11 +191,7 @@ impl Message { 0 => Message::AuthenticationOk, 2 => Message::AuthenticationKerberosV5, 3 => Message::AuthenticationCleartextPassword, - 5 => { - let mut salt = [0; 4]; - buf.read_exact(&mut salt)?; - Message::AuthenticationMd5Password(AuthenticationMd5PasswordBody { salt }) - } + 5 => Message::AuthenticationMd5Password, 6 => Message::AuthenticationScmCredential, 7 => Message::AuthenticationGss, 8 => Message::AuthenticationGssContinue, diff --git a/libs/proxy/postgres-protocol2/src/password/mod.rs b/libs/proxy/postgres-protocol2/src/password/mod.rs index e669e80f3f22..38eb31dfcf99 100644 --- a/libs/proxy/postgres-protocol2/src/password/mod.rs +++ b/libs/proxy/postgres-protocol2/src/password/mod.rs @@ -8,7 +8,6 @@ use crate::authentication::sasl; use hmac::{Hmac, Mac}; -use md5::Md5; use rand::RngCore; use sha2::digest::FixedOutput; use sha2::{Digest, Sha256}; @@ -88,20 +87,3 @@ pub(crate) async fn scram_sha_256_salt( base64::encode(server_key) ) } - -/// **Not recommended, as MD5 is not considered to be secure.** -/// -/// Hash password using MD5 with the username as the salt. -/// -/// The client may assume the returned string doesn't contain any -/// special characters that would require escaping. 
-pub fn md5(password: &[u8], username: &str) -> String { - // salt password with username - let mut salted_password = Vec::from(password); - salted_password.extend_from_slice(username.as_bytes()); - - let mut hash = Md5::new(); - hash.update(&salted_password); - let digest = hash.finalize(); - format!("md5{:x}", digest) -} diff --git a/libs/proxy/postgres-protocol2/src/password/test.rs b/libs/proxy/postgres-protocol2/src/password/test.rs index c9d340f09d80..0692c07adbb1 100644 --- a/libs/proxy/postgres-protocol2/src/password/test.rs +++ b/libs/proxy/postgres-protocol2/src/password/test.rs @@ -9,11 +9,3 @@ async fn test_encrypt_scram_sha_256() { "SCRAM-SHA-256$4096:AQIDBAUGBwgJCgsMDQ4PEA==$8rrDg00OqaiWXJ7p+sCgHEIaBSHY89ZJl3mfIsf32oY=:05L1f+yZbiN8O0AnO40Og85NNRhvzTS57naKRWCcsIA=" ); } - -#[test] -fn test_encrypt_md5() { - assert_eq!( - password::md5(b"secret", "foo"), - "md54ab2c5d00339c4b2a4e921d2dc4edec7" - ); -} diff --git a/libs/proxy/tokio-postgres2/src/config.rs b/libs/proxy/tokio-postgres2/src/config.rs index 26124b38ef8f..5dad835c3bdd 100644 --- a/libs/proxy/tokio-postgres2/src/config.rs +++ b/libs/proxy/tokio-postgres2/src/config.rs @@ -6,11 +6,9 @@ use crate::connect_raw::RawConnection; use crate::tls::MakeTlsConnect; use crate::tls::TlsConnect; use crate::{Client, Connection, Error}; -use std::borrow::Cow; +use std::fmt; use std::str; -use std::str::FromStr; use std::time::Duration; -use std::{error, fmt, iter, mem}; use tokio::io::{AsyncRead, AsyncWrite}; pub use postgres_protocol2::authentication::sasl::ScramKeys; @@ -380,99 +378,6 @@ impl Config { self.max_backend_message_size } - fn param(&mut self, key: &str, value: &str) -> Result<(), Error> { - match key { - "user" => { - self.user(value); - } - "password" => { - self.password(value); - } - "dbname" => { - self.dbname(value); - } - "options" => { - self.options(value); - } - "application_name" => { - self.application_name(value); - } - "sslmode" => { - let mode = match value { - "disable" => SslMode::Disable, - "prefer" => SslMode::Prefer, - "require" => SslMode::Require, - _ => return Err(Error::config_parse(Box::new(InvalidValue("sslmode")))), - }; - self.ssl_mode(mode); - } - "host" => { - for host in value.split(',') { - self.host(host); - } - } - "port" => { - for port in value.split(',') { - let port = if port.is_empty() { - 5432 - } else { - port.parse() - .map_err(|_| Error::config_parse(Box::new(InvalidValue("port"))))? 
- }; - self.port(port); - } - } - "connect_timeout" => { - let timeout = value - .parse::() - .map_err(|_| Error::config_parse(Box::new(InvalidValue("connect_timeout"))))?; - if timeout > 0 { - self.connect_timeout(Duration::from_secs(timeout as u64)); - } - } - "target_session_attrs" => { - let target_session_attrs = match value { - "any" => TargetSessionAttrs::Any, - "read-write" => TargetSessionAttrs::ReadWrite, - _ => { - return Err(Error::config_parse(Box::new(InvalidValue( - "target_session_attrs", - )))); - } - }; - self.target_session_attrs(target_session_attrs); - } - "channel_binding" => { - let channel_binding = match value { - "disable" => ChannelBinding::Disable, - "prefer" => ChannelBinding::Prefer, - "require" => ChannelBinding::Require, - _ => { - return Err(Error::config_parse(Box::new(InvalidValue( - "channel_binding", - )))) - } - }; - self.channel_binding(channel_binding); - } - "max_backend_message_size" => { - let limit = value.parse::().map_err(|_| { - Error::config_parse(Box::new(InvalidValue("max_backend_message_size"))) - })?; - if limit > 0 { - self.max_backend_message_size(limit); - } - } - key => { - return Err(Error::config_parse(Box::new(UnknownOption( - key.to_string(), - )))); - } - } - - Ok(()) - } - /// Opens a connection to a PostgreSQL database. /// /// Requires the `runtime` Cargo feature (enabled by default). @@ -499,17 +404,6 @@ impl Config { } } -impl FromStr for Config { - type Err = Error; - - fn from_str(s: &str) -> Result { - match UrlParser::parse(s)? { - Some(config) => Ok(config), - None => Parser::parse(s), - } - } -} - // Omit password from debug output impl fmt::Debug for Config { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { @@ -536,360 +430,3 @@ impl fmt::Debug for Config { .finish() } } - -#[derive(Debug)] -struct UnknownOption(String); - -impl fmt::Display for UnknownOption { - fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(fmt, "unknown option `{}`", self.0) - } -} - -impl error::Error for UnknownOption {} - -#[derive(Debug)] -struct InvalidValue(&'static str); - -impl fmt::Display for InvalidValue { - fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(fmt, "invalid value for option `{}`", self.0) - } -} - -impl error::Error for InvalidValue {} - -struct Parser<'a> { - s: &'a str, - it: iter::Peekable>, -} - -impl<'a> Parser<'a> { - fn parse(s: &'a str) -> Result { - let mut parser = Parser { - s, - it: s.char_indices().peekable(), - }; - - let mut config = Config::new(); - - while let Some((key, value)) = parser.parameter()? 
{ - config.param(key, &value)?; - } - - Ok(config) - } - - fn skip_ws(&mut self) { - self.take_while(char::is_whitespace); - } - - fn take_while(&mut self, f: F) -> &'a str - where - F: Fn(char) -> bool, - { - let start = match self.it.peek() { - Some(&(i, _)) => i, - None => return "", - }; - - loop { - match self.it.peek() { - Some(&(_, c)) if f(c) => { - self.it.next(); - } - Some(&(i, _)) => return &self.s[start..i], - None => return &self.s[start..], - } - } - } - - fn eat(&mut self, target: char) -> Result<(), Error> { - match self.it.next() { - Some((_, c)) if c == target => Ok(()), - Some((i, c)) => { - let m = format!( - "unexpected character at byte {}: expected `{}` but got `{}`", - i, target, c - ); - Err(Error::config_parse(m.into())) - } - None => Err(Error::config_parse("unexpected EOF".into())), - } - } - - fn eat_if(&mut self, target: char) -> bool { - match self.it.peek() { - Some(&(_, c)) if c == target => { - self.it.next(); - true - } - _ => false, - } - } - - fn keyword(&mut self) -> Option<&'a str> { - let s = self.take_while(|c| match c { - c if c.is_whitespace() => false, - '=' => false, - _ => true, - }); - - if s.is_empty() { - None - } else { - Some(s) - } - } - - fn value(&mut self) -> Result { - let value = if self.eat_if('\'') { - let value = self.quoted_value()?; - self.eat('\'')?; - value - } else { - self.simple_value()? - }; - - Ok(value) - } - - fn simple_value(&mut self) -> Result { - let mut value = String::new(); - - while let Some(&(_, c)) = self.it.peek() { - if c.is_whitespace() { - break; - } - - self.it.next(); - if c == '\\' { - if let Some((_, c2)) = self.it.next() { - value.push(c2); - } - } else { - value.push(c); - } - } - - if value.is_empty() { - return Err(Error::config_parse("unexpected EOF".into())); - } - - Ok(value) - } - - fn quoted_value(&mut self) -> Result { - let mut value = String::new(); - - while let Some(&(_, c)) = self.it.peek() { - if c == '\'' { - return Ok(value); - } - - self.it.next(); - if c == '\\' { - if let Some((_, c2)) = self.it.next() { - value.push(c2); - } - } else { - value.push(c); - } - } - - Err(Error::config_parse( - "unterminated quoted connection parameter value".into(), - )) - } - - fn parameter(&mut self) -> Result, Error> { - self.skip_ws(); - let keyword = match self.keyword() { - Some(keyword) => keyword, - None => return Ok(None), - }; - self.skip_ws(); - self.eat('=')?; - self.skip_ws(); - let value = self.value()?; - - Ok(Some((keyword, value))) - } -} - -// This is a pretty sloppy "URL" parser, but it matches the behavior of libpq, where things really aren't very strict -struct UrlParser<'a> { - s: &'a str, - config: Config, -} - -impl<'a> UrlParser<'a> { - fn parse(s: &'a str) -> Result, Error> { - let s = match Self::remove_url_prefix(s) { - Some(s) => s, - None => return Ok(None), - }; - - let mut parser = UrlParser { - s, - config: Config::new(), - }; - - parser.parse_credentials()?; - parser.parse_host()?; - parser.parse_path()?; - parser.parse_params()?; - - Ok(Some(parser.config)) - } - - fn remove_url_prefix(s: &str) -> Option<&str> { - for prefix in &["postgres://", "postgresql://"] { - if let Some(stripped) = s.strip_prefix(prefix) { - return Some(stripped); - } - } - - None - } - - fn take_until(&mut self, end: &[char]) -> Option<&'a str> { - match self.s.find(end) { - Some(pos) => { - let (head, tail) = self.s.split_at(pos); - self.s = tail; - Some(head) - } - None => None, - } - } - - fn take_all(&mut self) -> &'a str { - mem::take(&mut self.s) - } - - fn eat_byte(&mut self) { - 
self.s = &self.s[1..]; - } - - fn parse_credentials(&mut self) -> Result<(), Error> { - let creds = match self.take_until(&['@']) { - Some(creds) => creds, - None => return Ok(()), - }; - self.eat_byte(); - - let mut it = creds.splitn(2, ':'); - let user = self.decode(it.next().unwrap())?; - self.config.user(&user); - - if let Some(password) = it.next() { - let password = Cow::from(percent_encoding::percent_decode(password.as_bytes())); - self.config.password(password); - } - - Ok(()) - } - - fn parse_host(&mut self) -> Result<(), Error> { - let host = match self.take_until(&['/', '?']) { - Some(host) => host, - None => self.take_all(), - }; - - if host.is_empty() { - return Ok(()); - } - - for chunk in host.split(',') { - let (host, port) = if chunk.starts_with('[') { - let idx = match chunk.find(']') { - Some(idx) => idx, - None => return Err(Error::config_parse(InvalidValue("host").into())), - }; - - let host = &chunk[1..idx]; - let remaining = &chunk[idx + 1..]; - let port = if let Some(port) = remaining.strip_prefix(':') { - Some(port) - } else if remaining.is_empty() { - None - } else { - return Err(Error::config_parse(InvalidValue("host").into())); - }; - - (host, port) - } else { - let mut it = chunk.splitn(2, ':'); - (it.next().unwrap(), it.next()) - }; - - self.host_param(host)?; - let port = self.decode(port.unwrap_or("5432"))?; - self.config.param("port", &port)?; - } - - Ok(()) - } - - fn parse_path(&mut self) -> Result<(), Error> { - if !self.s.starts_with('/') { - return Ok(()); - } - self.eat_byte(); - - let dbname = match self.take_until(&['?']) { - Some(dbname) => dbname, - None => self.take_all(), - }; - - if !dbname.is_empty() { - self.config.dbname(&self.decode(dbname)?); - } - - Ok(()) - } - - fn parse_params(&mut self) -> Result<(), Error> { - if !self.s.starts_with('?') { - return Ok(()); - } - self.eat_byte(); - - while !self.s.is_empty() { - let key = match self.take_until(&['=']) { - Some(key) => self.decode(key)?, - None => return Err(Error::config_parse("unterminated parameter".into())), - }; - self.eat_byte(); - - let value = match self.take_until(&['&']) { - Some(value) => { - self.eat_byte(); - value - } - None => self.take_all(), - }; - - if key == "host" { - self.host_param(value)?; - } else { - let value = self.decode(value)?; - self.config.param(&key, &value)?; - } - } - - Ok(()) - } - - fn host_param(&mut self, s: &str) -> Result<(), Error> { - let s = self.decode(s)?; - self.config.param("host", &s) - } - - fn decode(&self, s: &'a str) -> Result, Error> { - percent_encoding::percent_decode(s.as_bytes()) - .decode_utf8() - .map_err(|e| Error::config_parse(e.into())) - } -} diff --git a/libs/proxy/tokio-postgres2/src/connect_raw.rs b/libs/proxy/tokio-postgres2/src/connect_raw.rs index 9c6f1a255200..390f133002be 100644 --- a/libs/proxy/tokio-postgres2/src/connect_raw.rs +++ b/libs/proxy/tokio-postgres2/src/connect_raw.rs @@ -7,7 +7,6 @@ use crate::Error; use bytes::BytesMut; use fallible_iterator::FallibleIterator; use futures_util::{ready, Sink, SinkExt, Stream, TryStreamExt}; -use postgres_protocol2::authentication; use postgres_protocol2::authentication::sasl; use postgres_protocol2::authentication::sasl::ScramSha256; use postgres_protocol2::message::backend::{AuthenticationSaslBody, Message, NoticeResponseBody}; @@ -174,25 +173,11 @@ where authenticate_password(stream, pass).await?; } - Some(Message::AuthenticationMd5Password(body)) => { - can_skip_channel_binding(config)?; - - let user = config - .user - .as_ref() - .ok_or_else(|| Error::config("user 
missing".into()))?; - let pass = config - .password - .as_ref() - .ok_or_else(|| Error::config("password missing".into()))?; - - let output = authentication::md5_hash(user.as_bytes(), pass, body.salt()); - authenticate_password(stream, output.as_bytes()).await?; - } Some(Message::AuthenticationSasl(body)) => { authenticate_sasl(stream, body, config).await?; } - Some(Message::AuthenticationKerberosV5) + Some(Message::AuthenticationMd5Password) + | Some(Message::AuthenticationKerberosV5) | Some(Message::AuthenticationScmCredential) | Some(Message::AuthenticationGss) | Some(Message::AuthenticationSspi) => { diff --git a/libs/proxy/tokio-postgres2/src/error/mod.rs b/libs/proxy/tokio-postgres2/src/error/mod.rs index 651432225009..922c348525c6 100644 --- a/libs/proxy/tokio-postgres2/src/error/mod.rs +++ b/libs/proxy/tokio-postgres2/src/error/mod.rs @@ -349,7 +349,6 @@ enum Kind { Parse, Encode, Authentication, - ConfigParse, Config, Connect, Timeout, @@ -386,7 +385,6 @@ impl fmt::Display for Error { Kind::Parse => fmt.write_str("error parsing response from server")?, Kind::Encode => fmt.write_str("error encoding message to server")?, Kind::Authentication => fmt.write_str("authentication error")?, - Kind::ConfigParse => fmt.write_str("invalid connection string")?, Kind::Config => fmt.write_str("invalid configuration")?, Kind::Connect => fmt.write_str("error connecting to server")?, Kind::Timeout => fmt.write_str("timeout waiting for server")?, @@ -482,10 +480,6 @@ impl Error { Error::new(Kind::Authentication, Some(e)) } - pub(crate) fn config_parse(e: Box) -> Error { - Error::new(Kind::ConfigParse, Some(e)) - } - pub(crate) fn config(e: Box) -> Error { Error::new(Kind::Config, Some(e)) } diff --git a/libs/proxy/tokio-postgres2/src/lib.rs b/libs/proxy/tokio-postgres2/src/lib.rs index 57c639a7de51..901ed0c96c68 100644 --- a/libs/proxy/tokio-postgres2/src/lib.rs +++ b/libs/proxy/tokio-postgres2/src/lib.rs @@ -13,14 +13,12 @@ pub use crate::query::RowStream; pub use crate::row::{Row, SimpleQueryRow}; pub use crate::simple_query::SimpleQueryStream; pub use crate::statement::{Column, Statement}; -use crate::tls::MakeTlsConnect; pub use crate::tls::NoTls; pub use crate::to_statement::ToStatement; pub use crate::transaction::Transaction; pub use crate::transaction_builder::{IsolationLevel, TransactionBuilder}; use crate::types::ToSql; use postgres_protocol2::message::backend::ReadyForQueryBody; -use tokio::net::TcpStream; /// After executing a query, the connection will be in one of these states #[derive(Clone, Copy, Debug, PartialEq)] @@ -72,24 +70,6 @@ mod transaction; mod transaction_builder; pub mod types; -/// A convenience function which parses a connection string and connects to the database. -/// -/// See the documentation for [`Config`] for details on the connection string format. -/// -/// Requires the `runtime` Cargo feature (enabled by default). -/// -/// [`Config`]: config/struct.Config.html -pub async fn connect( - config: &str, - tls: T, -) -> Result<(Client, Connection), Error> -where - T: MakeTlsConnect, -{ - let config = config.parse::()?; - config.connect(tls).await -} - /// An asynchronous notification. 
#[derive(Clone, Debug)] pub struct Notification { diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index f5934c8a89dd..2f63ee3acc42 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -6,7 +6,7 @@ license.workspace = true [features] default = [] -testing = [] +testing = ["dep:tokio-postgres"] [dependencies] ahash.workspace = true @@ -55,6 +55,7 @@ parquet.workspace = true parquet_derive.workspace = true pin-project-lite.workspace = true postgres_backend.workspace = true +postgres-client = { package = "tokio-postgres2", path = "../libs/proxy/tokio-postgres2" } postgres-protocol = { package = "postgres-protocol2", path = "../libs/proxy/postgres-protocol2" } pq_proto.workspace = true prometheus.workspace = true @@ -81,7 +82,7 @@ subtle.workspace = true thiserror.workspace = true tikv-jemallocator.workspace = true tikv-jemalloc-ctl = { workspace = true, features = ["use_std"] } -tokio-postgres = { package = "tokio-postgres2", path = "../libs/proxy/tokio-postgres2" } +tokio-postgres = { workspace = true, optional = true } tokio-rustls.workspace = true tokio-util.workspace = true tokio = { workspace = true, features = ["signal"] } @@ -119,3 +120,4 @@ rcgen.workspace = true rstest.workspace = true walkdir.workspace = true rand_distr = "0.4" +tokio-postgres.workspace = true diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index 491b272ac4e8..5e494dfdd694 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -66,7 +66,7 @@ pub(super) async fn authenticate( Ok(ComputeCredentials { info: creds, - keys: ComputeCredentialKeys::AuthKeys(tokio_postgres::config::AuthKeys::ScramSha256( + keys: ComputeCredentialKeys::AuthKeys(postgres_client::config::AuthKeys::ScramSha256( scram_keys, )), }) diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index bf7a1cb0705f..494564de05f0 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -1,8 +1,8 @@ use async_trait::async_trait; +use postgres_client::config::SslMode; use pq_proto::BeMessage as Be; use thiserror::Error; use tokio::io::{AsyncRead, AsyncWrite}; -use tokio_postgres::config::SslMode; use tracing::{info, info_span}; use super::ComputeCredentialKeys; diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 7e1b26a11a0d..84a572dcf9f1 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -11,8 +11,8 @@ pub use console_redirect::ConsoleRedirectBackend; pub(crate) use console_redirect::ConsoleRedirectError; use ipnet::{Ipv4Net, Ipv6Net}; use local::LocalBackend; +use postgres_client::config::AuthKeys; use tokio::io::{AsyncRead, AsyncWrite}; -use tokio_postgres::config::AuthKeys; use tracing::{debug, info, warn}; use crate::auth::credentials::check_peer_addr_is_in_list; diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs index 9c6ce151cba9..60d1962d7f78 100644 --- a/proxy/src/auth/flow.rs +++ b/proxy/src/auth/flow.rs @@ -227,7 +227,7 @@ pub(crate) async fn validate_password_and_exchange( }; Ok(sasl::Outcome::Success(ComputeCredentialKeys::AuthKeys( - tokio_postgres::config::AuthKeys::ScramSha256(keys), + postgres_client::config::AuthKeys::ScramSha256(keys), ))) } } diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index 91e198bf88a2..bcb0ef40bd74 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -3,11 +3,11 @@ use std::sync::Arc; use dashmap::DashMap; use ipnet::{IpNet, Ipv4Net, 
Ipv6Net}; +use postgres_client::{CancelToken, NoTls}; use pq_proto::CancelKeyData; use thiserror::Error; use tokio::net::TcpStream; use tokio::sync::Mutex; -use tokio_postgres::{CancelToken, NoTls}; use tracing::{debug, info}; use uuid::Uuid; @@ -44,7 +44,7 @@ pub(crate) enum CancelError { IO(#[from] std::io::Error), #[error("{0}")] - Postgres(#[from] tokio_postgres::Error), + Postgres(#[from] postgres_client::Error), #[error("rate limit exceeded")] RateLimit, @@ -70,7 +70,7 @@ impl ReportableError for CancelError { impl CancellationHandler

{ /// Run async action within an ephemeral session identified by [`CancelKeyData`]. pub(crate) fn get_session(self: Arc<Self>) -> Session

{ - // HACK: We'd rather get the real backend_pid but tokio_postgres doesn't + // HACK: We'd rather get the real backend_pid but postgres_client doesn't // expose it and we don't want to do another roundtrip to query // for it. The client will be able to notice that this is not the // actual backend_pid, but backend_pid is not used for anything diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index b689b97a2100..06bc71c55988 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -6,6 +6,8 @@ use std::time::Duration; use futures::{FutureExt, TryFutureExt}; use itertools::Itertools; use once_cell::sync::OnceCell; +use postgres_client::tls::MakeTlsConnect; +use postgres_client::{CancelToken, RawConnection}; use postgres_protocol::message::backend::NoticeResponseBody; use pq_proto::StartupMessageParams; use rustls::client::danger::ServerCertVerifier; @@ -13,8 +15,6 @@ use rustls::crypto::ring; use rustls::pki_types::InvalidDnsNameError; use thiserror::Error; use tokio::net::TcpStream; -use tokio_postgres::tls::MakeTlsConnect; -use tokio_postgres::{CancelToken, RawConnection}; use tracing::{debug, error, info, warn}; use crate::auth::parse_endpoint_param; @@ -34,9 +34,9 @@ pub const COULD_NOT_CONNECT: &str = "Couldn't connect to compute node"; #[derive(Debug, Error)] pub(crate) enum ConnectionError { /// This error doesn't seem to reveal any secrets; for instance, - /// `tokio_postgres::error::Kind` doesn't contain ip addresses and such. + /// `postgres_client::error::Kind` doesn't contain ip addresses and such. #[error("{COULD_NOT_CONNECT}: {0}")] - Postgres(#[from] tokio_postgres::Error), + Postgres(#[from] postgres_client::Error), #[error("{COULD_NOT_CONNECT}: {0}")] CouldNotConnect(#[from] io::Error), @@ -99,13 +99,13 @@ impl ReportableError for ConnectionError { } /// A pair of `ClientKey` & `ServerKey` for `SCRAM-SHA-256`. -pub(crate) type ScramKeys = tokio_postgres::config::ScramKeys<32>; +pub(crate) type ScramKeys = postgres_client::config::ScramKeys<32>; /// A config for establishing a connection to compute node. -/// Eventually, `tokio_postgres` will be replaced with something better. +/// Eventually, `postgres_client` will be replaced with something better. /// Newtype allows us to implement methods on top of it. #[derive(Clone, Default)] -pub(crate) struct ConnCfg(Box); +pub(crate) struct ConnCfg(Box); /// Creation and initialization routines. impl ConnCfg { @@ -126,7 +126,7 @@ impl ConnCfg { pub(crate) fn get_host(&self) -> Result { match self.0.get_hosts() { - [tokio_postgres::config::Host::Tcp(s)] => Ok(s.into()), + [postgres_client::config::Host::Tcp(s)] => Ok(s.into()), // we should not have multiple address or unix addresses. _ => Err(WakeComputeError::BadComputeAddress( "invalid compute address".into(), @@ -160,7 +160,7 @@ impl ConnCfg { // TODO: This is especially ugly... if let Some(replication) = params.get("replication") { - use tokio_postgres::config::ReplicationMode; + use postgres_client::config::ReplicationMode; match replication { "true" | "on" | "yes" | "1" => { self.replication_mode(ReplicationMode::Physical); @@ -182,7 +182,7 @@ impl ConnCfg { } impl std::ops::Deref for ConnCfg { - type Target = tokio_postgres::Config; + type Target = postgres_client::Config; fn deref(&self) -> &Self::Target { &self.0 @@ -199,7 +199,7 @@ impl std::ops::DerefMut for ConnCfg { impl ConnCfg { /// Establish a raw TCP connection to the compute node. 
async fn connect_raw(&self, timeout: Duration) -> io::Result<(SocketAddr, TcpStream, &str)> { - use tokio_postgres::config::Host; + use postgres_client::config::Host; // wrap TcpStream::connect with timeout let connect_with_timeout = |host, port| { @@ -224,7 +224,7 @@ impl ConnCfg { }) }; - // We can't reuse connection establishing logic from `tokio_postgres` here, + // We can't reuse connection establishing logic from `postgres_client` here, // because it has no means for extracting the underlying socket which we // require for our business. let mut connection_error = None; @@ -272,7 +272,7 @@ type RustlsStream = > pub(crate) struct PostgresConnection { /// Socket connected to a compute node. pub(crate) stream: - tokio_postgres::maybe_tls_stream::MaybeTlsStream, + postgres_client::maybe_tls_stream::MaybeTlsStream, /// PostgreSQL connection parameters. pub(crate) params: std::collections::HashMap, /// Query cancellation token. diff --git a/proxy/src/control_plane/client/mock.rs b/proxy/src/control_plane/client/mock.rs index 9537d717a1f1..4d55f96ca198 100644 --- a/proxy/src/control_plane/client/mock.rs +++ b/proxy/src/control_plane/client/mock.rs @@ -5,7 +5,6 @@ use std::sync::Arc; use futures::TryFutureExt; use thiserror::Error; -use tokio_postgres::config::SslMode; use tokio_postgres::Client; use tracing::{error, info, info_span, warn, Instrument}; @@ -165,7 +164,7 @@ impl MockControlPlane { config .host(self.endpoint.host_str().unwrap_or("localhost")) .port(self.endpoint.port().unwrap_or(5432)) - .ssl_mode(SslMode::Disable); + .ssl_mode(postgres_client::config::SslMode::Disable); let node = NodeInfo { config, diff --git a/proxy/src/control_plane/client/neon.rs b/proxy/src/control_plane/client/neon.rs index 2cad981d01ac..5a78ec9d32aa 100644 --- a/proxy/src/control_plane/client/neon.rs +++ b/proxy/src/control_plane/client/neon.rs @@ -6,8 +6,8 @@ use std::time::Duration; use ::http::header::AUTHORIZATION; use ::http::HeaderName; use futures::TryFutureExt; +use postgres_client::config::SslMode; use tokio::time::Instant; -use tokio_postgres::config::SslMode; use tracing::{debug, info, info_span, warn, Instrument}; use super::super::messages::{ControlPlaneErrorMessage, GetRoleSecret, WakeCompute}; diff --git a/proxy/src/error.rs b/proxy/src/error.rs index 2221aac407fc..6a379499dc62 100644 --- a/proxy/src/error.rs +++ b/proxy/src/error.rs @@ -84,7 +84,7 @@ pub(crate) trait ReportableError: fmt::Display + Send + 'static { fn get_error_kind(&self) -> ErrorKind; } -impl ReportableError for tokio_postgres::error::Error { +impl ReportableError for postgres_client::error::Error { fn get_error_kind(&self) -> ErrorKind { if self.as_db_error().is_some() { ErrorKind::Postgres diff --git a/proxy/src/postgres_rustls/mod.rs b/proxy/src/postgres_rustls/mod.rs index 31e7915e89fd..5ef20991c309 100644 --- a/proxy/src/postgres_rustls/mod.rs +++ b/proxy/src/postgres_rustls/mod.rs @@ -1,10 +1,10 @@ use std::convert::TryFrom; use std::sync::Arc; +use postgres_client::tls::MakeTlsConnect; use rustls::pki_types::ServerName; use rustls::ClientConfig; use tokio::io::{AsyncRead, AsyncWrite}; -use tokio_postgres::tls::MakeTlsConnect; mod private { use std::future::Future; @@ -12,9 +12,9 @@ mod private { use std::pin::Pin; use std::task::{Context, Poll}; + use postgres_client::tls::{ChannelBinding, TlsConnect}; use rustls::pki_types::ServerName; use tokio::io::{AsyncRead, AsyncWrite, ReadBuf}; - use tokio_postgres::tls::{ChannelBinding, TlsConnect}; use tokio_rustls::client::TlsStream; use tokio_rustls::TlsConnector; @@ 
-59,7 +59,7 @@ mod private { pub struct RustlsStream(TlsStream); - impl tokio_postgres::tls::TlsStream for RustlsStream + impl postgres_client::tls::TlsStream for RustlsStream where S: AsyncRead + AsyncWrite + Unpin, { diff --git a/proxy/src/proxy/retry.rs b/proxy/src/proxy/retry.rs index d3f0c3e7d471..42d1491782dd 100644 --- a/proxy/src/proxy/retry.rs +++ b/proxy/src/proxy/retry.rs @@ -31,9 +31,9 @@ impl CouldRetry for io::Error { } } -impl CouldRetry for tokio_postgres::error::DbError { +impl CouldRetry for postgres_client::error::DbError { fn could_retry(&self) -> bool { - use tokio_postgres::error::SqlState; + use postgres_client::error::SqlState; matches!( self.code(), &SqlState::CONNECTION_FAILURE @@ -43,9 +43,9 @@ impl CouldRetry for tokio_postgres::error::DbError { ) } } -impl ShouldRetryWakeCompute for tokio_postgres::error::DbError { +impl ShouldRetryWakeCompute for postgres_client::error::DbError { fn should_retry_wake_compute(&self) -> bool { - use tokio_postgres::error::SqlState; + use postgres_client::error::SqlState; // Here are errors that happens after the user successfully authenticated to the database. // TODO: there are pgbouncer errors that should be retried, but they are not listed here. !matches!( @@ -61,21 +61,21 @@ impl ShouldRetryWakeCompute for tokio_postgres::error::DbError { } } -impl CouldRetry for tokio_postgres::Error { +impl CouldRetry for postgres_client::Error { fn could_retry(&self) -> bool { if let Some(io_err) = self.source().and_then(|x| x.downcast_ref()) { io::Error::could_retry(io_err) } else if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) { - tokio_postgres::error::DbError::could_retry(db_err) + postgres_client::error::DbError::could_retry(db_err) } else { false } } } -impl ShouldRetryWakeCompute for tokio_postgres::Error { +impl ShouldRetryWakeCompute for postgres_client::Error { fn should_retry_wake_compute(&self) -> bool { if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) { - tokio_postgres::error::DbError::should_retry_wake_compute(db_err) + postgres_client::error::DbError::should_retry_wake_compute(db_err) } else { // likely an IO error. Possible the compute has shutdown and the // cache is stale. 
diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index fe211adfeb7b..ef351f3b54b2 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -8,9 +8,9 @@ use std::fmt::Debug; use bytes::{Bytes, BytesMut}; use futures::{SinkExt, StreamExt}; +use postgres_client::tls::TlsConnect; use postgres_protocol::message::frontend; use tokio::io::{AsyncReadExt, DuplexStream}; -use tokio_postgres::tls::TlsConnect; use tokio_util::codec::{Decoder, Encoder}; use super::*; @@ -158,8 +158,8 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> { Scram::new("password").await?, )); - let _client_err = tokio_postgres::Config::new() - .channel_binding(tokio_postgres::config::ChannelBinding::Disable) + let _client_err = postgres_client::Config::new() + .channel_binding(postgres_client::config::ChannelBinding::Disable) .user("user") .dbname("db") .password("password") @@ -175,7 +175,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> { async fn scram_auth_prefer_channel_binding() -> anyhow::Result<()> { connect_failure( Intercept::None, - tokio_postgres::config::ChannelBinding::Prefer, + postgres_client::config::ChannelBinding::Prefer, ) .await } @@ -185,7 +185,7 @@ async fn scram_auth_prefer_channel_binding() -> anyhow::Result<()> { async fn scram_auth_prefer_channel_binding_intercept() -> anyhow::Result<()> { connect_failure( Intercept::Methods, - tokio_postgres::config::ChannelBinding::Prefer, + postgres_client::config::ChannelBinding::Prefer, ) .await } @@ -195,7 +195,7 @@ async fn scram_auth_prefer_channel_binding_intercept() -> anyhow::Result<()> { async fn scram_auth_prefer_channel_binding_intercept_response() -> anyhow::Result<()> { connect_failure( Intercept::SASLResponse, - tokio_postgres::config::ChannelBinding::Prefer, + postgres_client::config::ChannelBinding::Prefer, ) .await } @@ -205,7 +205,7 @@ async fn scram_auth_prefer_channel_binding_intercept_response() -> anyhow::Resul async fn scram_auth_require_channel_binding() -> anyhow::Result<()> { connect_failure( Intercept::None, - tokio_postgres::config::ChannelBinding::Require, + postgres_client::config::ChannelBinding::Require, ) .await } @@ -215,7 +215,7 @@ async fn scram_auth_require_channel_binding() -> anyhow::Result<()> { async fn scram_auth_require_channel_binding_intercept() -> anyhow::Result<()> { connect_failure( Intercept::Methods, - tokio_postgres::config::ChannelBinding::Require, + postgres_client::config::ChannelBinding::Require, ) .await } @@ -225,14 +225,14 @@ async fn scram_auth_require_channel_binding_intercept() -> anyhow::Result<()> { async fn scram_auth_require_channel_binding_intercept_response() -> anyhow::Result<()> { connect_failure( Intercept::SASLResponse, - tokio_postgres::config::ChannelBinding::Require, + postgres_client::config::ChannelBinding::Require, ) .await } async fn connect_failure( intercept: Intercept, - channel_binding: tokio_postgres::config::ChannelBinding, + channel_binding: postgres_client::config::ChannelBinding, ) -> anyhow::Result<()> { let (server, client, client_config, server_config) = proxy_mitm(intercept).await; let proxy = tokio::spawn(dummy_proxy( @@ -241,7 +241,7 @@ async fn connect_failure( Scram::new("password").await?, )); - let _client_err = tokio_postgres::Config::new() + let _client_err = postgres_client::Config::new() .channel_binding(channel_binding) .user("user") .dbname("db") diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 15be6c9724e8..c8b742b3ff23 100644 --- 
a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -7,13 +7,13 @@ use std::time::Duration; use anyhow::{bail, Context}; use async_trait::async_trait; use http::StatusCode; +use postgres_client::config::SslMode; +use postgres_client::tls::{MakeTlsConnect, NoTls}; use retry::{retry_after, ShouldRetryWakeCompute}; use rstest::rstest; use rustls::crypto::ring; use rustls::pki_types; use tokio::io::DuplexStream; -use tokio_postgres::config::SslMode; -use tokio_postgres::tls::{MakeTlsConnect, NoTls}; use super::connect_compute::ConnectMechanism; use super::retry::CouldRetry; @@ -204,7 +204,7 @@ async fn handshake_tls_is_enforced_by_proxy() -> anyhow::Result<()> { let (_, server_config) = generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), NoAuth)); - let client_err = tokio_postgres::Config::new() + let client_err = postgres_client::Config::new() .user("john_doe") .dbname("earth") .ssl_mode(SslMode::Disable) @@ -233,7 +233,7 @@ async fn handshake_tls() -> anyhow::Result<()> { generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), NoAuth)); - let _conn = tokio_postgres::Config::new() + let _conn = postgres_client::Config::new() .user("john_doe") .dbname("earth") .ssl_mode(SslMode::Require) @@ -249,7 +249,7 @@ async fn handshake_raw() -> anyhow::Result<()> { let proxy = tokio::spawn(dummy_proxy(client, None, NoAuth)); - let _conn = tokio_postgres::Config::new() + let _conn = postgres_client::Config::new() .user("john_doe") .dbname("earth") .options("project=generic-project-name") @@ -296,8 +296,8 @@ async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> { Scram::new(password).await?, )); - let _conn = tokio_postgres::Config::new() - .channel_binding(tokio_postgres::config::ChannelBinding::Require) + let _conn = postgres_client::Config::new() + .channel_binding(postgres_client::config::ChannelBinding::Require) .user("user") .dbname("db") .password(password) @@ -320,8 +320,8 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> { Scram::new("password").await?, )); - let _conn = tokio_postgres::Config::new() - .channel_binding(tokio_postgres::config::ChannelBinding::Disable) + let _conn = postgres_client::Config::new() + .channel_binding(postgres_client::config::ChannelBinding::Disable) .user("user") .dbname("db") .password("password") @@ -348,7 +348,7 @@ async fn scram_auth_mock() -> anyhow::Result<()> { .map(char::from) .collect(); - let _client_err = tokio_postgres::Config::new() + let _client_err = postgres_client::Config::new() .user("user") .dbname("db") .password(&password) // no password will match the mocked secret diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 57846a4c2c51..8c7931907da5 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -37,9 +37,9 @@ use crate::types::{EndpointId, Host, LOCAL_PROXY_SUFFIX}; pub(crate) struct PoolingBackend { pub(crate) http_conn_pool: Arc>>, - pub(crate) local_pool: Arc>, + pub(crate) local_pool: Arc>, pub(crate) pool: - Arc>>, + Arc>>, pub(crate) config: &'static ProxyConfig, pub(crate) auth_backend: &'static crate::auth::Backend<'static, ()>, @@ -170,7 +170,7 @@ impl PoolingBackend { conn_info: ConnInfo, keys: ComputeCredentials, force_new: bool, - ) -> Result, HttpConnError> { + ) -> Result, HttpConnError> { let maybe_client = if force_new { debug!("pool: pool is 
disabled"); None @@ -256,7 +256,7 @@ impl PoolingBackend { &self, ctx: &RequestContext, conn_info: ConnInfo, - ) -> Result, HttpConnError> { + ) -> Result, HttpConnError> { if let Some(client) = self.local_pool.get(ctx, &conn_info)? { return Ok(client); } @@ -315,7 +315,7 @@ impl PoolingBackend { )); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); - let (client, connection) = config.connect(tokio_postgres::NoTls).await?; + let (client, connection) = config.connect(postgres_client::NoTls).await?; drop(pause); let pid = client.get_process_id(); @@ -360,7 +360,7 @@ pub(crate) enum HttpConnError { #[error("pooled connection closed at inconsistent state")] ConnectionClosedAbruptly(#[from] tokio::sync::watch::error::SendError), #[error("could not connection to postgres in compute")] - PostgresConnectionError(#[from] tokio_postgres::Error), + PostgresConnectionError(#[from] postgres_client::Error), #[error("could not connection to local-proxy in compute")] LocalProxyConnectionError(#[from] LocalProxyConnError), #[error("could not parse JWT payload")] @@ -479,7 +479,7 @@ impl ShouldRetryWakeCompute for LocalProxyConnError { } struct TokioMechanism { - pool: Arc>>, + pool: Arc>>, conn_info: ConnInfo, conn_id: uuid::Uuid, @@ -489,7 +489,7 @@ struct TokioMechanism { #[async_trait] impl ConnectMechanism for TokioMechanism { - type Connection = Client; + type Connection = Client; type ConnectError = HttpConnError; type Error = HttpConnError; @@ -509,7 +509,7 @@ impl ConnectMechanism for TokioMechanism { .connect_timeout(timeout); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); - let res = config.connect(tokio_postgres::NoTls).await; + let res = config.connect(postgres_client::NoTls).await; drop(pause); let (client, connection) = permit.release_result(res)?; diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs index c302eac5684b..cac5a173cb16 100644 --- a/proxy/src/serverless/conn_pool.rs +++ b/proxy/src/serverless/conn_pool.rs @@ -5,11 +5,11 @@ use std::task::{ready, Poll}; use futures::future::poll_fn; use futures::Future; +use postgres_client::tls::NoTlsStream; +use postgres_client::AsyncMessage; use smallvec::SmallVec; use tokio::net::TcpStream; use tokio::time::Instant; -use tokio_postgres::tls::NoTlsStream; -use tokio_postgres::AsyncMessage; use tokio_util::sync::CancellationToken; use tracing::{error, info, info_span, warn, Instrument}; #[cfg(test)] @@ -58,7 +58,7 @@ pub(crate) fn poll_client( ctx: &RequestContext, conn_info: ConnInfo, client: C, - mut connection: tokio_postgres::Connection, + mut connection: postgres_client::Connection, conn_id: uuid::Uuid, aux: MetricsAuxInfo, ) -> Client { diff --git a/proxy/src/serverless/conn_pool_lib.rs b/proxy/src/serverless/conn_pool_lib.rs index fe1d2563bca1..2a46c8f9c5cf 100644 --- a/proxy/src/serverless/conn_pool_lib.rs +++ b/proxy/src/serverless/conn_pool_lib.rs @@ -7,8 +7,8 @@ use std::time::Duration; use dashmap::DashMap; use parking_lot::RwLock; +use postgres_client::ReadyForQueryStatus; use rand::Rng; -use tokio_postgres::ReadyForQueryStatus; use tracing::{debug, info, Span}; use super::backend::HttpConnError; @@ -683,7 +683,7 @@ pub(crate) trait ClientInnerExt: Sync + Send + 'static { fn get_process_id(&self) -> i32; } -impl ClientInnerExt for tokio_postgres::Client { +impl ClientInnerExt for postgres_client::Client { fn is_closed(&self) -> bool { self.is_closed() } diff --git a/proxy/src/serverless/json.rs b/proxy/src/serverless/json.rs index 569e2da5715a..25b25c66d3fb 
100644 --- a/proxy/src/serverless/json.rs +++ b/proxy/src/serverless/json.rs @@ -1,6 +1,6 @@ +use postgres_client::types::{Kind, Type}; +use postgres_client::Row; use serde_json::{Map, Value}; -use tokio_postgres::types::{Kind, Type}; -use tokio_postgres::Row; // // Convert json non-string types to strings, so that they can be passed to Postgres @@ -61,7 +61,7 @@ fn json_array_to_pg_array(value: &Value) -> Option { #[derive(Debug, thiserror::Error)] pub(crate) enum JsonConversionError { #[error("internal error compute returned invalid data: {0}")] - AsTextError(tokio_postgres::Error), + AsTextError(postgres_client::Error), #[error("parse int error: {0}")] ParseIntError(#[from] std::num::ParseIntError), #[error("parse float error: {0}")] diff --git a/proxy/src/serverless/local_conn_pool.rs b/proxy/src/serverless/local_conn_pool.rs index db9ac49dae8f..b84cde9e252a 100644 --- a/proxy/src/serverless/local_conn_pool.rs +++ b/proxy/src/serverless/local_conn_pool.rs @@ -22,13 +22,13 @@ use indexmap::IndexMap; use jose_jwk::jose_b64::base64ct::{Base64UrlUnpadded, Encoding}; use p256::ecdsa::{Signature, SigningKey}; use parking_lot::RwLock; +use postgres_client::tls::NoTlsStream; +use postgres_client::types::ToSql; +use postgres_client::AsyncMessage; use serde_json::value::RawValue; use signature::Signer; use tokio::net::TcpStream; use tokio::time::Instant; -use tokio_postgres::tls::NoTlsStream; -use tokio_postgres::types::ToSql; -use tokio_postgres::AsyncMessage; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, info_span, warn, Instrument}; @@ -164,7 +164,7 @@ pub(crate) fn poll_client( ctx: &RequestContext, conn_info: ConnInfo, client: C, - mut connection: tokio_postgres::Connection, + mut connection: postgres_client::Connection, key: SigningKey, conn_id: uuid::Uuid, aux: MetricsAuxInfo, @@ -280,7 +280,7 @@ pub(crate) fn poll_client( ) } -impl ClientInnerCommon { +impl ClientInnerCommon { pub(crate) async fn set_jwt_session(&mut self, payload: &[u8]) -> Result<(), HttpConnError> { if let ClientDataEnum::Local(local_data) = &mut self.data { local_data.jti += 1; diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index a0ca7cc60d6a..5e85f5ec4019 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -11,12 +11,12 @@ use http_body_util::{BodyExt, Full}; use hyper::body::Incoming; use hyper::http::{HeaderName, HeaderValue}; use hyper::{header, HeaderMap, Request, Response, StatusCode}; +use postgres_client::error::{DbError, ErrorPosition, SqlState}; +use postgres_client::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction}; use pq_proto::StartupMessageParamsBuilder; use serde::Serialize; use serde_json::Value; use tokio::time::{self, Instant}; -use tokio_postgres::error::{DbError, ErrorPosition, SqlState}; -use tokio_postgres::{GenericClient, IsolationLevel, NoTls, ReadyForQueryStatus, Transaction}; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info}; use typed_json::json; @@ -361,7 +361,7 @@ pub(crate) enum SqlOverHttpError { #[error("invalid isolation level")] InvalidIsolationLevel, #[error("{0}")] - Postgres(#[from] tokio_postgres::Error), + Postgres(#[from] postgres_client::Error), #[error("{0}")] JsonConversion(#[from] JsonConversionError), #[error("{0}")] @@ -986,7 +986,7 @@ async fn query_to_json( // Manually drain the stream into a vector to leave row_stream hanging // around to get a command tag. Also check that the response is not too // big. 
- let mut rows: Vec = Vec::new(); + let mut rows: Vec = Vec::new(); while let Some(row) = row_stream.next().await { let row = row?; *current_size += row.body_len(); @@ -1063,13 +1063,13 @@ async fn query_to_json( } enum Client { - Remote(conn_pool_lib::Client), - Local(conn_pool_lib::Client), + Remote(conn_pool_lib::Client), + Local(conn_pool_lib::Client), } enum Discard<'a> { - Remote(conn_pool_lib::Discard<'a, tokio_postgres::Client>), - Local(conn_pool_lib::Discard<'a, tokio_postgres::Client>), + Remote(conn_pool_lib::Discard<'a, postgres_client::Client>), + Local(conn_pool_lib::Discard<'a, postgres_client::Client>), } impl Client { @@ -1080,7 +1080,7 @@ impl Client { } } - fn inner(&mut self) -> (&mut tokio_postgres::Client, Discard<'_>) { + fn inner(&mut self) -> (&mut postgres_client::Client, Discard<'_>) { match self { Client::Remote(client) => { let (c, d) = client.inner(); From f312c6571f45395f4a5adfb2b0450741c16ebd58 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Tue, 3 Dec 2024 19:47:17 +0100 Subject: [PATCH 23/65] pageserver: respond to multiple shutdown signals (#9982) ## Problem The Pageserver signal handler would only respond to a single signal and initiate shutdown. Subsequent signals were ignored. This meant that a `SIGQUIT` sent after a `SIGTERM` had no effect (e.g. in the case of a slow or stalled shutdown). The `test_runner` uses this to force shutdown if graceful shutdown is slow. Touches #9740. ## Summary of changes Keep responding to signals after the initial shutdown signal has been received. Arguably, the `test_runner` should also use `SIGKILL` rather than `SIGQUIT` in this case, but it seems reasonable to respond to `SIGQUIT` regardless. --- pageserver/src/bin/pageserver.rs | 82 +++++++++++++++++++------------- 1 file changed, 48 insertions(+), 34 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 8fe225c6aa90..567a69da3b12 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -636,45 +636,59 @@ fn start_pageserver( tokio::net::TcpListener::from_std(pageserver_listener).context("create tokio listener")? }); - let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard()); - // All started up! Now just sit and wait for shutdown signal. - - { - BACKGROUND_RUNTIME.block_on(async move { + BACKGROUND_RUNTIME.block_on(async move { + let signal_token = CancellationToken::new(); + let signal_cancel = signal_token.child_token(); + + // Spawn signal handlers. Runs in a loop since we want to be responsive to multiple signals + // even after triggering shutdown (e.g. a SIGQUIT after a slow SIGTERM shutdown). See: + // https://github.com/neondatabase/neon/issues/9740. + tokio::spawn(async move { let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt()).unwrap(); let mut sigterm = tokio::signal::unix::signal(SignalKind::terminate()).unwrap(); let mut sigquit = tokio::signal::unix::signal(SignalKind::quit()).unwrap(); - let signal = tokio::select! { - _ = sigquit.recv() => { - info!("Got signal SIGQUIT. Terminating in immediate shutdown mode",); - std::process::exit(111); + + loop { + let signal = tokio::select! { + _ = sigquit.recv() => { + info!("Got signal SIGQUIT. Terminating in immediate shutdown mode."); + std::process::exit(111); + } + _ = sigint.recv() => "SIGINT", + _ = sigterm.recv() => "SIGTERM", + }; + + if !signal_token.is_cancelled() { + info!("Got signal {signal}. 
Terminating gracefully in fast shutdown mode."); + signal_token.cancel(); + } else { + info!("Got signal {signal}. Already shutting down."); } - _ = sigint.recv() => { "SIGINT" }, - _ = sigterm.recv() => { "SIGTERM" }, - }; - - info!("Got signal {signal}. Terminating gracefully in fast shutdown mode",); - - // This cancels the `shutdown_pageserver` cancellation tree. - // Right now that tree doesn't reach very far, and `task_mgr` is used instead. - // The plan is to change that over time. - shutdown_pageserver.take(); - pageserver::shutdown_pageserver( - http_endpoint_listener, - page_service, - consumption_metrics_tasks, - disk_usage_eviction_task, - &tenant_manager, - background_purges, - deletion_queue.clone(), - secondary_controller_tasks, - 0, - ) - .await; - unreachable!() - }) - } + } + }); + + // Wait for cancellation signal and shut down the pageserver. + // + // This cancels the `shutdown_pageserver` cancellation tree. Right now that tree doesn't + // reach very far, and `task_mgr` is used instead. The plan is to change that over time. + signal_cancel.cancelled().await; + + shutdown_pageserver.cancel(); + pageserver::shutdown_pageserver( + http_endpoint_listener, + page_service, + consumption_metrics_tasks, + disk_usage_eviction_task, + &tenant_manager, + background_purges, + deletion_queue.clone(), + secondary_controller_tasks, + 0, + ) + .await; + unreachable!(); + }) } async fn create_remote_storage_client( From 3baef0bca3e9217519f72734c773a6a1f880c90f Mon Sep 17 00:00:00 2001 From: Alexey Immoreev Date: Tue, 3 Dec 2024 22:59:44 +0400 Subject: [PATCH 24/65] Improvement: add console redirect timeout warning (#9985) ## Problem There is no information on session being cancelled in 2 minutes at the moment ## Summary of changes The timeout being logged for the user --- proxy/src/auth/backend/console_redirect.rs | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index 494564de05f0..619c7b4ef112 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -49,13 +49,19 @@ impl ReportableError for ConsoleRedirectError { } } -fn hello_message(redirect_uri: &reqwest::Url, session_id: &str) -> String { +fn hello_message( + redirect_uri: &reqwest::Url, + session_id: &str, + duration: std::time::Duration, +) -> String { + let formatted_duration = humantime::format_duration(duration).to_string(); format!( concat![ "Welcome to Neon!\n", - "Authenticate by visiting:\n", + "Authenticate by visiting (will expire in {duration}):\n", " {redirect_uri}{session_id}\n\n", ], + duration = formatted_duration, redirect_uri = redirect_uri, session_id = session_id, ) @@ -118,7 +124,11 @@ async fn authenticate( }; let span = info_span!("console_redirect", psql_session_id = &psql_session_id); - let greeting = hello_message(link_uri, &psql_session_id); + let greeting = hello_message( + link_uri, + &psql_session_id, + auth_config.console_redirect_confirmation_timeout, + ); // Give user a URL to spawn a new database. info!(parent: &span, "sending the auth URL to the user"); From 9ef0662a42585aa20a68db9243b7623cc5bd6c56 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 3 Dec 2024 20:00:14 +0000 Subject: [PATCH 25/65] chore(proxy): enforce single host+port (#9995) proxy doesn't ever provide multiple hosts/ports, so this code adds a lot of complexity of error handling for no good reason. 
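For illustration, a minimal sketch of what call sites look like after this change (based on the test updates further down in this patch; the host and port values here are placeholders):

```rust
// Host and port are now supplied up front; there is no longer a list of
// fallback hosts/ports for the connect path to iterate over.
let mut config = postgres_client::Config::new("compute.example.internal".to_owned(), 5432);
config
    .user("john_doe")
    .dbname("earth")
    .ssl_mode(postgres_client::config::SslMode::Disable);
```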
(stacked on #9990) --- libs/proxy/tokio-postgres2/src/config.rs | 41 ++++----------- libs/proxy/tokio-postgres2/src/connect.rs | 38 ++++---------- proxy/src/auth/backend/console_redirect.rs | 8 +-- proxy/src/auth/backend/local.rs | 7 +-- proxy/src/compute.rs | 61 ++++++---------------- proxy/src/control_plane/client/mock.rs | 10 ++-- proxy/src/control_plane/client/neon.rs | 4 +- proxy/src/proxy/connect_compute.rs | 2 +- proxy/src/proxy/tests/mitm.rs | 4 +- proxy/src/proxy/tests/mod.rs | 14 ++--- proxy/src/serverless/backend.rs | 10 ++-- 11 files changed, 58 insertions(+), 141 deletions(-) diff --git a/libs/proxy/tokio-postgres2/src/config.rs b/libs/proxy/tokio-postgres2/src/config.rs index 5dad835c3bdd..fd10ef6f207d 100644 --- a/libs/proxy/tokio-postgres2/src/config.rs +++ b/libs/proxy/tokio-postgres2/src/config.rs @@ -146,6 +146,9 @@ pub enum AuthKeys { /// ``` #[derive(Clone, PartialEq, Eq)] pub struct Config { + pub(crate) host: Host, + pub(crate) port: u16, + pub(crate) user: Option, pub(crate) password: Option>, pub(crate) auth_keys: Option>, @@ -153,8 +156,6 @@ pub struct Config { pub(crate) options: Option, pub(crate) application_name: Option, pub(crate) ssl_mode: SslMode, - pub(crate) host: Vec, - pub(crate) port: Vec, pub(crate) connect_timeout: Option, pub(crate) target_session_attrs: TargetSessionAttrs, pub(crate) channel_binding: ChannelBinding, @@ -162,16 +163,12 @@ pub struct Config { pub(crate) max_backend_message_size: Option, } -impl Default for Config { - fn default() -> Config { - Config::new() - } -} - impl Config { /// Creates a new configuration. - pub fn new() -> Config { + pub fn new(host: String, port: u16) -> Config { Config { + host: Host::Tcp(host), + port, user: None, password: None, auth_keys: None, @@ -179,8 +176,6 @@ impl Config { options: None, application_name: None, ssl_mode: SslMode::Prefer, - host: vec![], - port: vec![], connect_timeout: None, target_session_attrs: TargetSessionAttrs::Any, channel_binding: ChannelBinding::Prefer, @@ -283,32 +278,14 @@ impl Config { self.ssl_mode } - /// Adds a host to the configuration. - /// - /// Multiple hosts can be specified by calling this method multiple times, and each will be tried in order. - pub fn host(&mut self, host: &str) -> &mut Config { - self.host.push(Host::Tcp(host.to_string())); - self - } - /// Gets the hosts that have been added to the configuration with `host`. - pub fn get_hosts(&self) -> &[Host] { + pub fn get_host(&self) -> &Host { &self.host } - /// Adds a port to the configuration. - /// - /// Multiple ports can be specified by calling this method multiple times. There must either be no ports, in which - /// case the default of 5432 is used, a single port, in which it is used for all hosts, or the same number of ports - /// as hosts. - pub fn port(&mut self, port: u16) -> &mut Config { - self.port.push(port); - self - } - /// Gets the ports that have been added to the configuration with `port`. - pub fn get_ports(&self) -> &[u16] { - &self.port + pub fn get_port(&self) -> u16 { + self.port } /// Sets the timeout applied to socket-level connection attempts. 
diff --git a/libs/proxy/tokio-postgres2/src/connect.rs b/libs/proxy/tokio-postgres2/src/connect.rs index 98067d91f942..75a58e6eacc9 100644 --- a/libs/proxy/tokio-postgres2/src/connect.rs +++ b/libs/proxy/tokio-postgres2/src/connect.rs @@ -19,38 +19,18 @@ pub async fn connect( where T: MakeTlsConnect, { - if config.host.is_empty() { - return Err(Error::config("host missing".into())); - } - - if config.port.len() > 1 && config.port.len() != config.host.len() { - return Err(Error::config("invalid number of ports".into())); - } - - let mut error = None; - for (i, host) in config.host.iter().enumerate() { - let port = config - .port - .get(i) - .or_else(|| config.port.first()) - .copied() - .unwrap_or(5432); - - let hostname = match host { - Host::Tcp(host) => host.as_str(), - }; + let hostname = match &config.host { + Host::Tcp(host) => host.as_str(), + }; - let tls = tls - .make_tls_connect(hostname) - .map_err(|e| Error::tls(e.into()))?; + let tls = tls + .make_tls_connect(hostname) + .map_err(|e| Error::tls(e.into()))?; - match connect_once(host, port, tls, config).await { - Ok((client, connection)) => return Ok((client, connection)), - Err(e) => error = Some(e), - } + match connect_once(&config.host, config.port, tls, config).await { + Ok((client, connection)) => Ok((client, connection)), + Err(e) => Err(e), } - - Err(error.unwrap()) } async fn connect_once( diff --git a/proxy/src/auth/backend/console_redirect.rs b/proxy/src/auth/backend/console_redirect.rs index 619c7b4ef112..575d60be8559 100644 --- a/proxy/src/auth/backend/console_redirect.rs +++ b/proxy/src/auth/backend/console_redirect.rs @@ -161,12 +161,8 @@ async fn authenticate( // This config should be self-contained, because we won't // take username or dbname from client's startup message. - let mut config = compute::ConnCfg::new(); - config - .host(&db_info.host) - .port(db_info.port) - .dbname(&db_info.dbname) - .user(&db_info.user); + let mut config = compute::ConnCfg::new(db_info.host.to_string(), db_info.port); + config.dbname(&db_info.dbname).user(&db_info.user); ctx.set_dbname(db_info.dbname.into()); ctx.set_user(db_info.user.into()); diff --git a/proxy/src/auth/backend/local.rs b/proxy/src/auth/backend/local.rs index 32e0f536153d..d4273fb52167 100644 --- a/proxy/src/auth/backend/local.rs +++ b/proxy/src/auth/backend/local.rs @@ -29,12 +29,7 @@ impl LocalBackend { api: http::Endpoint::new(compute_ctl, http::new_client()), }, node_info: NodeInfo { - config: { - let mut cfg = ConnCfg::new(); - cfg.host(&postgres_addr.ip().to_string()); - cfg.port(postgres_addr.port()); - cfg - }, + config: ConnCfg::new(postgres_addr.ip().to_string(), postgres_addr.port()), // TODO(conrad): make this better reflect compute info rather than endpoint info. aux: MetricsAuxInfo { endpoint_id: EndpointIdTag::get_interner().get_or_intern("local"), diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index 06bc71c55988..ab0ff4b7950a 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -104,13 +104,13 @@ pub(crate) type ScramKeys = postgres_client::config::ScramKeys<32>; /// A config for establishing a connection to compute node. /// Eventually, `postgres_client` will be replaced with something better. /// Newtype allows us to implement methods on top of it. -#[derive(Clone, Default)] +#[derive(Clone)] pub(crate) struct ConnCfg(Box); /// Creation and initialization routines. 
impl ConnCfg { - pub(crate) fn new() -> Self { - Self::default() + pub(crate) fn new(host: String, port: u16) -> Self { + Self(Box::new(postgres_client::Config::new(host, port))) } /// Reuse password or auth keys from the other config. @@ -124,13 +124,9 @@ impl ConnCfg { } } - pub(crate) fn get_host(&self) -> Result { - match self.0.get_hosts() { - [postgres_client::config::Host::Tcp(s)] => Ok(s.into()), - // we should not have multiple address or unix addresses. - _ => Err(WakeComputeError::BadComputeAddress( - "invalid compute address".into(), - )), + pub(crate) fn get_host(&self) -> Host { + match self.0.get_host() { + postgres_client::config::Host::Tcp(s) => s.into(), } } @@ -227,43 +223,20 @@ impl ConnCfg { // We can't reuse connection establishing logic from `postgres_client` here, // because it has no means for extracting the underlying socket which we // require for our business. - let mut connection_error = None; - let ports = self.0.get_ports(); - let hosts = self.0.get_hosts(); - // the ports array is supposed to have 0 entries, 1 entry, or as many entries as in the hosts array - if ports.len() > 1 && ports.len() != hosts.len() { - return Err(io::Error::new( - io::ErrorKind::Other, - format!( - "bad compute config, \ - ports and hosts entries' count does not match: {:?}", - self.0 - ), - )); - } + let port = self.0.get_port(); + let host = self.0.get_host(); - for (i, host) in hosts.iter().enumerate() { - let port = ports.get(i).or_else(|| ports.first()).unwrap_or(&5432); - let host = match host { - Host::Tcp(host) => host.as_str(), - }; - - match connect_once(host, *port).await { - Ok((sockaddr, stream)) => return Ok((sockaddr, stream, host)), - Err(err) => { - // We can't throw an error here, as there might be more hosts to try. - warn!("couldn't connect to compute node at {host}:{port}: {err}"); - connection_error = Some(err); - } + let host = match host { + Host::Tcp(host) => host.as_str(), + }; + + match connect_once(host, port).await { + Ok((sockaddr, stream)) => Ok((sockaddr, stream, host)), + Err(err) => { + warn!("couldn't connect to compute node at {host}:{port}: {err}"); + Err(err) } } - - Err(connection_error.unwrap_or_else(|| { - io::Error::new( - io::ErrorKind::Other, - format!("bad compute config: {:?}", self.0), - ) - })) } } diff --git a/proxy/src/control_plane/client/mock.rs b/proxy/src/control_plane/client/mock.rs index 4d55f96ca198..eaf692ab279b 100644 --- a/proxy/src/control_plane/client/mock.rs +++ b/proxy/src/control_plane/client/mock.rs @@ -160,11 +160,11 @@ impl MockControlPlane { } async fn do_wake_compute(&self) -> Result { - let mut config = compute::ConnCfg::new(); - config - .host(self.endpoint.host_str().unwrap_or("localhost")) - .port(self.endpoint.port().unwrap_or(5432)) - .ssl_mode(postgres_client::config::SslMode::Disable); + let mut config = compute::ConnCfg::new( + self.endpoint.host_str().unwrap_or("localhost").to_owned(), + self.endpoint.port().unwrap_or(5432), + ); + config.ssl_mode(postgres_client::config::SslMode::Disable); let node = NodeInfo { config, diff --git a/proxy/src/control_plane/client/neon.rs b/proxy/src/control_plane/client/neon.rs index 5a78ec9d32aa..5c204ae1d700 100644 --- a/proxy/src/control_plane/client/neon.rs +++ b/proxy/src/control_plane/client/neon.rs @@ -241,8 +241,8 @@ impl NeonControlPlaneClient { // Don't set anything but host and port! This config will be cached. // We'll set username and such later using the startup message. // TODO: add more type safety (in progress). 
- let mut config = compute::ConnCfg::new(); - config.host(host).port(port).ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes. + let mut config = compute::ConnCfg::new(host.to_owned(), port); + config.ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes. let node = NodeInfo { config, diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 2e759b0894a2..585dce7baeb8 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -86,7 +86,7 @@ impl ConnectMechanism for TcpMechanism<'_> { node_info: &control_plane::CachedNodeInfo, timeout: time::Duration, ) -> Result { - let host = node_info.config.get_host()?; + let host = node_info.config.get_host(); let permit = self.locks.get_permit(&host).await?; permit.release_result(node_info.connect(ctx, timeout).await) } diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index ef351f3b54b2..d72331c7bf78 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -158,7 +158,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> { Scram::new("password").await?, )); - let _client_err = postgres_client::Config::new() + let _client_err = postgres_client::Config::new("test".to_owned(), 5432) .channel_binding(postgres_client::config::ChannelBinding::Disable) .user("user") .dbname("db") @@ -241,7 +241,7 @@ async fn connect_failure( Scram::new("password").await?, )); - let _client_err = postgres_client::Config::new() + let _client_err = postgres_client::Config::new("test".to_owned(), 5432) .channel_binding(channel_binding) .user("user") .dbname("db") diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index c8b742b3ff23..53345431e3cf 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -204,7 +204,7 @@ async fn handshake_tls_is_enforced_by_proxy() -> anyhow::Result<()> { let (_, server_config) = generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), NoAuth)); - let client_err = postgres_client::Config::new() + let client_err = postgres_client::Config::new("test".to_owned(), 5432) .user("john_doe") .dbname("earth") .ssl_mode(SslMode::Disable) @@ -233,7 +233,7 @@ async fn handshake_tls() -> anyhow::Result<()> { generate_tls_config("generic-project-name.localhost", "localhost")?; let proxy = tokio::spawn(dummy_proxy(client, Some(server_config), NoAuth)); - let _conn = postgres_client::Config::new() + let _conn = postgres_client::Config::new("test".to_owned(), 5432) .user("john_doe") .dbname("earth") .ssl_mode(SslMode::Require) @@ -249,7 +249,7 @@ async fn handshake_raw() -> anyhow::Result<()> { let proxy = tokio::spawn(dummy_proxy(client, None, NoAuth)); - let _conn = postgres_client::Config::new() + let _conn = postgres_client::Config::new("test".to_owned(), 5432) .user("john_doe") .dbname("earth") .options("project=generic-project-name") @@ -296,7 +296,7 @@ async fn scram_auth_good(#[case] password: &str) -> anyhow::Result<()> { Scram::new(password).await?, )); - let _conn = postgres_client::Config::new() + let _conn = postgres_client::Config::new("test".to_owned(), 5432) .channel_binding(postgres_client::config::ChannelBinding::Require) .user("user") .dbname("db") @@ -320,7 +320,7 @@ async fn scram_auth_disable_channel_binding() -> anyhow::Result<()> { Scram::new("password").await?, )); - let _conn = postgres_client::Config::new() + let _conn = 
postgres_client::Config::new("test".to_owned(), 5432) .channel_binding(postgres_client::config::ChannelBinding::Disable) .user("user") .dbname("db") @@ -348,7 +348,7 @@ async fn scram_auth_mock() -> anyhow::Result<()> { .map(char::from) .collect(); - let _client_err = postgres_client::Config::new() + let _client_err = postgres_client::Config::new("test".to_owned(), 5432) .user("user") .dbname("db") .password(&password) // no password will match the mocked secret @@ -546,7 +546,7 @@ impl TestControlPlaneClient for TestConnectMechanism { fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo { let node = NodeInfo { - config: compute::ConnCfg::new(), + config: compute::ConnCfg::new("test".to_owned(), 5432), aux: MetricsAuxInfo { endpoint_id: (&EndpointId::from("endpoint")).into(), project_id: (&ProjectId::from("project")).into(), diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 8c7931907da5..55d2e47fd3f2 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -499,7 +499,7 @@ impl ConnectMechanism for TokioMechanism { node_info: &CachedNodeInfo, timeout: Duration, ) -> Result { - let host = node_info.config.get_host()?; + let host = node_info.config.get_host(); let permit = self.locks.get_permit(&host).await?; let mut config = (*node_info.config).clone(); @@ -549,16 +549,12 @@ impl ConnectMechanism for HyperMechanism { node_info: &CachedNodeInfo, timeout: Duration, ) -> Result { - let host = node_info.config.get_host()?; + let host = node_info.config.get_host(); let permit = self.locks.get_permit(&host).await?; let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); - let port = *node_info.config.get_ports().first().ok_or_else(|| { - HttpConnError::WakeCompute(WakeComputeError::BadComputeAddress( - "local-proxy port missing on compute address".into(), - )) - })?; + let port = node_info.config.get_port(); let res = connect_http2(&host, port, timeout).await; drop(pause); let (client, connection) = permit.release_result(res)?; From ca85f364ba3fd0ed41c2be9995722725cbaee78f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Tue, 3 Dec 2024 21:39:10 +0100 Subject: [PATCH 26/65] Support tenant manifests in the scrubber (#9942) Support tenant manifests in the storage scrubber: * list the manifests, order them by generation * delete all manifests except for the two most recent generations * for the latest manifest: try parsing it. I've tested this patch by running the against a staging bucket and it successfully deleted stuff (and avoided deleting the latest two generations). In follow-up work, we might want to also check some invariants of the manifest, as mentioned in #8088. 
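To make the retention rule above concrete, a minimal, self-contained sketch (illustrative generation numbers only; the real logic in `gc_tenant_manifests` operates on `(Generation, ListingObject)` pairs):

```rust
fn main() {
    // Keep the two highest generations, regardless of whether they are
    // consecutive; everything older becomes a deletion candidate.
    let mut generations = vec![3u32, 7, 4, 9];
    generations.sort();
    let candidates: Vec<u32> = generations.iter().rev().skip(2).copied().collect();
    assert_eq!(candidates, vec![4, 3]); // generations 9 and 7 are retained
}
```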
Part of #9386 Part of #8088 --------- Co-authored-by: Christian Schwarz --- .../src/tenant/remote_timeline_client.rs | 4 +- .../tenant/remote_timeline_client/manifest.rs | 2 +- storage_scrubber/src/checks.rs | 135 ++++++++- .../src/pageserver_physical_gc.rs | 258 ++++++++++++++---- test_runner/regress/test_timeline_archive.py | 114 ++++++++ 5 files changed, 459 insertions(+), 54 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 007bd3eef083..4bb1bbf3cfd5 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -2564,9 +2564,9 @@ pub fn parse_remote_index_path(path: RemotePath) -> Option { } /// Given the key of a tenant manifest, parse out the generation number -pub(crate) fn parse_remote_tenant_manifest_path(path: RemotePath) -> Option { +pub fn parse_remote_tenant_manifest_path(path: RemotePath) -> Option { static RE: OnceLock = OnceLock::new(); - let re = RE.get_or_init(|| Regex::new(r".+tenant-manifest-([0-9a-f]{8}).json").unwrap()); + let re = RE.get_or_init(|| Regex::new(r".*tenant-manifest-([0-9a-f]{8}).json").unwrap()); re.captures(path.get_path().as_str()) .and_then(|c| c.get(1)) .and_then(|m| Generation::parse_suffix(m.as_str())) diff --git a/pageserver/src/tenant/remote_timeline_client/manifest.rs b/pageserver/src/tenant/remote_timeline_client/manifest.rs index c4382cb6480f..2029847a1249 100644 --- a/pageserver/src/tenant/remote_timeline_client/manifest.rs +++ b/pageserver/src/tenant/remote_timeline_client/manifest.rs @@ -43,7 +43,7 @@ impl TenantManifest { offloaded_timelines: vec![], } } - pub(crate) fn from_json_bytes(bytes: &[u8]) -> Result { + pub fn from_json_bytes(bytes: &[u8]) -> Result { serde_json::from_slice::(bytes) } diff --git a/storage_scrubber/src/checks.rs b/storage_scrubber/src/checks.rs index 8d855d263cfd..1b4ff01a170a 100644 --- a/storage_scrubber/src/checks.rs +++ b/storage_scrubber/src/checks.rs @@ -4,17 +4,21 @@ use itertools::Itertools; use pageserver::tenant::checks::check_valid_layermap; use pageserver::tenant::layer_map::LayerMap; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; +use pageserver::tenant::remote_timeline_client::manifest::TenantManifest; use pageserver_api::shard::ShardIndex; use tokio_util::sync::CancellationToken; use tracing::{info, warn}; use utils::generation::Generation; use utils::id::TimelineId; +use utils::shard::TenantShardId; use crate::cloud_admin_api::BranchData; use crate::metadata_stream::stream_listing; use crate::{download_object_with_retries, RootTarget, TenantShardTimelineId}; use futures_util::StreamExt; -use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path}; +use pageserver::tenant::remote_timeline_client::{ + parse_remote_index_path, parse_remote_tenant_manifest_path, remote_layer_path, +}; use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::IndexPart; use remote_storage::{GenericRemoteStorage, ListingObject, RemotePath}; @@ -527,3 +531,132 @@ async fn list_timeline_blobs_impl( unknown_keys, })) } + +pub(crate) struct RemoteTenantManifestInfo { + pub(crate) latest_generation: Option, + pub(crate) manifests: Vec<(Generation, ListingObject)>, +} + +pub(crate) enum ListTenantManifestResult { + WithErrors { + errors: Vec<(String, String)>, + #[allow(dead_code)] + unknown_keys: Vec, + }, + NoErrors(RemoteTenantManifestInfo), +} + +/// Lists the tenant manifests in remote storage and parses the 
latest one, returning a [`ListTenantManifestResult`] object. +pub(crate) async fn list_tenant_manifests( + remote_client: &GenericRemoteStorage, + tenant_id: TenantShardId, + root_target: &RootTarget, +) -> anyhow::Result { + let mut errors = Vec::new(); + let mut unknown_keys = Vec::new(); + + let mut tenant_root_target = root_target.tenant_root(&tenant_id); + let original_prefix = tenant_root_target.prefix_in_bucket.clone(); + const TENANT_MANIFEST_STEM: &str = "tenant-manifest"; + tenant_root_target.prefix_in_bucket += TENANT_MANIFEST_STEM; + tenant_root_target.delimiter = String::new(); + + let mut manifests: Vec<(Generation, ListingObject)> = Vec::new(); + + let prefix_str = &original_prefix + .strip_prefix("/") + .unwrap_or(&original_prefix); + + let mut stream = std::pin::pin!(stream_listing(remote_client, &tenant_root_target)); + 'outer: while let Some(obj) = stream.next().await { + let (key, Some(obj)) = obj? else { + panic!("ListingObject not specified"); + }; + + 'err: { + // TODO a let chain would be nicer here. + let Some(name) = key.object_name() else { + break 'err; + }; + if !name.starts_with(TENANT_MANIFEST_STEM) { + break 'err; + } + let Some(generation) = parse_remote_tenant_manifest_path(key.clone()) else { + break 'err; + }; + tracing::debug!("tenant manifest {key}"); + manifests.push((generation, obj)); + continue 'outer; + } + tracing::info!("Listed an unknown key: {key}"); + unknown_keys.push(obj); + } + + if manifests.is_empty() { + tracing::debug!("No manifest for timeline."); + + return Ok(ListTenantManifestResult::WithErrors { + errors, + unknown_keys, + }); + } + if !unknown_keys.is_empty() { + errors.push(((*prefix_str).to_owned(), "unknown keys listed".to_string())); + + return Ok(ListTenantManifestResult::WithErrors { + errors, + unknown_keys, + }); + } + + // Find the manifest with the highest generation + let (latest_generation, latest_listing_object) = manifests + .iter() + .max_by_key(|i| i.0) + .map(|(g, obj)| (*g, obj.clone())) + .unwrap(); + + let manifest_bytes = + match download_object_with_retries(remote_client, &latest_listing_object.key).await { + Ok(bytes) => bytes, + Err(e) => { + // It is possible that the tenant gets deleted in-between we list the objects + // and we download the manifest file. 
+ errors.push(( + latest_listing_object.key.get_path().as_str().to_owned(), + format!("failed to download tenant-manifest.json: {e}"), + )); + return Ok(ListTenantManifestResult::WithErrors { + errors, + unknown_keys, + }); + } + }; + + match TenantManifest::from_json_bytes(&manifest_bytes) { + Ok(_manifest) => { + return Ok(ListTenantManifestResult::NoErrors( + RemoteTenantManifestInfo { + latest_generation: Some(latest_generation), + manifests, + }, + )); + } + Err(parse_error) => errors.push(( + latest_listing_object.key.get_path().as_str().to_owned(), + format!("tenant-manifest.json body parsing error: {parse_error}"), + )), + } + + if errors.is_empty() { + errors.push(( + (*prefix_str).to_owned(), + "Unexpected: no errors did not lead to a successfully parsed blob return".to_string(), + )); + } + + Ok(ListTenantManifestResult::WithErrors { + errors, + unknown_keys, + }) +} diff --git a/storage_scrubber/src/pageserver_physical_gc.rs b/storage_scrubber/src/pageserver_physical_gc.rs index 1e69ddbf150c..20cb9c3633ac 100644 --- a/storage_scrubber/src/pageserver_physical_gc.rs +++ b/storage_scrubber/src/pageserver_physical_gc.rs @@ -2,12 +2,16 @@ use std::collections::{BTreeMap, BTreeSet, HashMap}; use std::sync::Arc; use std::time::Duration; -use crate::checks::{list_timeline_blobs, BlobDataParseResult}; +use crate::checks::{ + list_tenant_manifests, list_timeline_blobs, BlobDataParseResult, ListTenantManifestResult, +}; use crate::metadata_stream::{stream_tenant_timelines, stream_tenants}; use crate::{init_remote, BucketConfig, NodeKind, RootTarget, TenantShardTimelineId, MAX_RETRIES}; use futures_util::{StreamExt, TryStreamExt}; use pageserver::tenant::remote_timeline_client::index::LayerFileMetadata; -use pageserver::tenant::remote_timeline_client::{parse_remote_index_path, remote_layer_path}; +use pageserver::tenant::remote_timeline_client::{ + parse_remote_index_path, parse_remote_tenant_manifest_path, remote_layer_path, +}; use pageserver::tenant::storage_layer::LayerName; use pageserver::tenant::IndexPart; use pageserver_api::controller_api::TenantDescribeResponse; @@ -25,6 +29,7 @@ use utils::id::{TenantId, TenantTimelineId}; #[derive(Serialize, Default)] pub struct GcSummary { indices_deleted: usize, + tenant_manifests_deleted: usize, remote_storage_errors: usize, controller_api_errors: usize, ancestor_layers_deleted: usize, @@ -34,12 +39,14 @@ impl GcSummary { fn merge(&mut self, other: Self) { let Self { indices_deleted, + tenant_manifests_deleted, remote_storage_errors, ancestor_layers_deleted, controller_api_errors, } = other; self.indices_deleted += indices_deleted; + self.tenant_manifests_deleted += tenant_manifests_deleted; self.remote_storage_errors += remote_storage_errors; self.ancestor_layers_deleted += ancestor_layers_deleted; self.controller_api_errors += controller_api_errors; @@ -352,6 +359,69 @@ async fn maybe_delete_index( } } +async fn maybe_delete_tenant_manifest( + remote_client: &GenericRemoteStorage, + min_age: &Duration, + latest_gen: Generation, + obj: &ListingObject, + mode: GcMode, + summary: &mut GcSummary, +) { + // Validation: we will only delete things that parse cleanly + let basename = obj.key.get_path().file_name().unwrap(); + let Some(candidate_generation) = + parse_remote_tenant_manifest_path(RemotePath::from_string(basename).unwrap()) + else { + // A strange key: we will not delete this because we don't understand it. 
+ tracing::warn!("Bad index key"); + return; + }; + + // Validation: we will only delete manifests more than one generation old, and in fact we + // should never be called with such recent generations. + if candidate_generation >= latest_gen { + tracing::warn!("Deletion candidate is >= latest generation, this is a bug!"); + return; + } else if candidate_generation.next() == latest_gen { + tracing::warn!("Deletion candidate is >= latest generation - 1, this is a bug!"); + return; + } + + if !is_old_enough(min_age, obj, summary) { + return; + } + + if matches!(mode, GcMode::DryRun) { + tracing::info!("Dry run: would delete this key"); + return; + } + + // All validations passed: erase the object + let cancel = CancellationToken::new(); + match backoff::retry( + || remote_client.delete(&obj.key, &cancel), + |_| false, + 3, + MAX_RETRIES as u32, + "maybe_delete_tenant_manifest", + &cancel, + ) + .await + { + None => { + unreachable!("Using a dummy cancellation token"); + } + Some(Ok(_)) => { + tracing::info!("Successfully deleted tenant manifest"); + summary.tenant_manifests_deleted += 1; + } + Some(Err(e)) => { + tracing::warn!("Failed to delete tenant manifest: {e}"); + summary.remote_storage_errors += 1; + } + } +} + #[allow(clippy::too_many_arguments)] async fn gc_ancestor( remote_client: &GenericRemoteStorage, @@ -451,13 +521,100 @@ async fn gc_ancestor( Ok(()) } +async fn gc_tenant_manifests( + remote_client: &GenericRemoteStorage, + min_age: Duration, + target: &RootTarget, + mode: GcMode, + tenant_shard_id: TenantShardId, +) -> anyhow::Result { + let mut gc_summary = GcSummary::default(); + match list_tenant_manifests(remote_client, tenant_shard_id, target).await? { + ListTenantManifestResult::WithErrors { + errors, + unknown_keys: _, + } => { + for (_key, error) in errors { + tracing::warn!(%tenant_shard_id, "list_tenant_manifests: {error}"); + } + } + ListTenantManifestResult::NoErrors(mut manifest_info) => { + let Some(latest_gen) = manifest_info.latest_generation else { + return Ok(gc_summary); + }; + manifest_info + .manifests + .sort_by_key(|(generation, _obj)| *generation); + // skip the two latest generations (they don't neccessarily have to be 1 apart from each other) + let candidates = manifest_info.manifests.iter().rev().skip(2); + for (_generation, key) in candidates { + maybe_delete_tenant_manifest( + remote_client, + &min_age, + latest_gen, + key, + mode, + &mut gc_summary, + ) + .instrument( + info_span!("maybe_delete_tenant_manifest", %tenant_shard_id, ?latest_gen, %key.key), + ) + .await; + } + } + } + Ok(gc_summary) +} + +async fn gc_timeline( + remote_client: &GenericRemoteStorage, + min_age: &Duration, + target: &RootTarget, + mode: GcMode, + ttid: TenantShardTimelineId, + accumulator: &Arc>, +) -> anyhow::Result { + let mut summary = GcSummary::default(); + let data = list_timeline_blobs(remote_client, ttid, target).await?; + + let (index_part, latest_gen, candidates) = match &data.blob_data { + BlobDataParseResult::Parsed { + index_part, + index_part_generation, + s3_layers: _s3_layers, + } => (index_part, *index_part_generation, data.unused_index_keys), + BlobDataParseResult::Relic => { + // Post-deletion tenant location: don't try and GC it. 
+ return Ok(summary); + } + BlobDataParseResult::Incorrect { + errors, + s3_layers: _, + } => { + // Our primary purpose isn't to report on bad data, but log this rather than skipping silently + tracing::warn!("Skipping timeline {ttid}, bad metadata: {errors:?}"); + return Ok(summary); + } + }; + + accumulator.lock().unwrap().update(ttid, index_part); + + for key in candidates { + maybe_delete_index(remote_client, min_age, latest_gen, &key, mode, &mut summary) + .instrument(info_span!("maybe_delete_index", %ttid, ?latest_gen, %key.key)) + .await; + } + + Ok(summary) +} + /// Physical garbage collection: removing unused S3 objects. /// /// This is distinct from the garbage collection done inside the pageserver, which operates at a higher level /// (keys, layers). This type of garbage collection is about removing: /// - Objects that were uploaded but never referenced in the remote index (e.g. because of a shutdown between /// uploading a layer and uploading an index) -/// - Index objects from historic generations +/// - Index objects and tenant manifests from historic generations /// /// This type of GC is not necessary for correctness: rather it serves to reduce wasted storage capacity, and /// make sure that object listings don't get slowed down by large numbers of garbage objects. @@ -470,6 +627,7 @@ pub async fn pageserver_physical_gc( ) -> anyhow::Result { let (remote_client, target) = init_remote(bucket_config.clone(), NodeKind::Pageserver).await?; + let remote_client = Arc::new(remote_client); let tenants = if tenant_shard_ids.is_empty() { futures::future::Either::Left(stream_tenants(&remote_client, &target)) } else { @@ -484,59 +642,59 @@ pub async fn pageserver_physical_gc( let accumulator = Arc::new(std::sync::Mutex::new(TenantRefAccumulator::default())); // Generate a stream of TenantTimelineId - let timelines = tenants.map_ok(|t| stream_tenant_timelines(&remote_client, &target, t)); - let timelines = timelines.try_buffered(CONCURRENCY); - let timelines = timelines.try_flatten(); - - // Generate a stream of S3TimelineBlobData - async fn gc_timeline( - remote_client: &GenericRemoteStorage, - min_age: &Duration, - target: &RootTarget, - mode: GcMode, - ttid: TenantShardTimelineId, - accumulator: &Arc>, - ) -> anyhow::Result { - let mut summary = GcSummary::default(); - let data = list_timeline_blobs(remote_client, ttid, target).await?; - - let (index_part, latest_gen, candidates) = match &data.blob_data { - BlobDataParseResult::Parsed { - index_part, - index_part_generation, - s3_layers: _s3_layers, - } => (index_part, *index_part_generation, data.unused_index_keys), - BlobDataParseResult::Relic => { - // Post-deletion tenant location: don't try and GC it. 
- return Ok(summary); - } - BlobDataParseResult::Incorrect { - errors, - s3_layers: _, - } => { - // Our primary purpose isn't to report on bad data, but log this rather than skipping silently - tracing::warn!("Skipping timeline {ttid}, bad metadata: {errors:?}"); - return Ok(summary); - } - }; - - accumulator.lock().unwrap().update(ttid, index_part); - - for key in candidates { - maybe_delete_index(remote_client, min_age, latest_gen, &key, mode, &mut summary) - .instrument(info_span!("maybe_delete_index", %ttid, ?latest_gen, %key.key)) - .await; - } - - Ok(summary) + enum GcSummaryOrContent { + Content(T), + GcSummary(GcSummary), } + let timelines = tenants.map_ok(|tenant_shard_id| { + let target_ref = ⌖ + let remote_client_ref = &remote_client; + async move { + let summaries_from_manifests = match gc_tenant_manifests( + remote_client_ref, + min_age, + target_ref, + mode, + tenant_shard_id, + ) + .await + { + Ok(gc_summary) => vec![Ok(GcSummaryOrContent::::GcSummary( + gc_summary, + ))], + Err(e) => { + tracing::warn!(%tenant_shard_id, "Error in gc_tenant_manifests: {e}"); + Vec::new() + } + }; + stream_tenant_timelines(remote_client_ref, target_ref, tenant_shard_id) + .await + .map(|stream| { + stream + .map_ok(GcSummaryOrContent::Content) + .chain(futures::stream::iter(summaries_from_manifests.into_iter())) + }) + } + }); + let timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); + let timelines = timelines.try_flatten(); let mut summary = GcSummary::default(); // Drain futures for per-shard GC, populating accumulator as a side effect { - let timelines = timelines.map_ok(|ttid| { - gc_timeline(&remote_client, &min_age, &target, mode, ttid, &accumulator) + let timelines = timelines.map_ok(|summary_or_ttid| match summary_or_ttid { + GcSummaryOrContent::Content(ttid) => futures::future::Either::Left(gc_timeline( + &remote_client, + &min_age, + &target, + mode, + ttid, + &accumulator, + )), + GcSummaryOrContent::GcSummary(gc_summary) => { + futures::future::Either::Right(futures::future::ok(gc_summary)) + } }); let mut timelines = std::pin::pin!(timelines.try_buffered(CONCURRENCY)); diff --git a/test_runner/regress/test_timeline_archive.py b/test_runner/regress/test_timeline_archive.py index 5a1e493bbec7..e808dd13966c 100644 --- a/test_runner/regress/test_timeline_archive.py +++ b/test_runner/regress/test_timeline_archive.py @@ -835,3 +835,117 @@ def test_timeline_retain_lsn( with env.endpoints.create_start("test_archived_branch", tenant_id=tenant_id) as endpoint: sum = endpoint.safe_psql("SELECT sum(key) from foo where v < 51200") assert sum == pre_branch_sum + + +def test_timeline_offload_generations(neon_env_builder: NeonEnvBuilder): + """ + Test for scrubber deleting old generations of manifests + """ + remote_storage_kind = s3_storage() + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + + env = neon_env_builder.init_start() + ps_http = env.pageserver.http_client() + + # Turn off gc and compaction loops: we want to issue them manually for better reliability + tenant_id, root_timeline_id = env.create_tenant( + conf={ + "gc_period": "0s", + "compaction_period": "0s", + "checkpoint_distance": f"{1024 ** 2}", + } + ) + + # Create a branch and archive it + child_timeline_id = env.create_branch("test_archived_branch_persisted", tenant_id) + + with env.endpoints.create_start( + "test_archived_branch_persisted", tenant_id=tenant_id + ) as endpoint: + endpoint.safe_psql_many( + [ + "CREATE TABLE foo(key serial primary key, t text default 'data_content')", + 
"INSERT INTO foo SELECT FROM generate_series(1,512)", + ] + ) + sum = endpoint.safe_psql("SELECT sum(key) from foo where key % 3 = 2") + last_flush_lsn_upload(env, endpoint, tenant_id, child_timeline_id) + + assert_prefix_not_empty( + neon_env_builder.pageserver_remote_storage, + prefix=f"tenants/{str(tenant_id)}/", + ) + assert_prefix_empty( + neon_env_builder.pageserver_remote_storage, + prefix=f"tenants/{str(tenant_id)}/tenant-manifest", + ) + + ps_http.timeline_archival_config( + tenant_id, + child_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + + def timeline_offloaded_api(timeline_id: TimelineId) -> bool: + # TODO add a proper API to check if a timeline has been offloaded or not + return not any( + timeline["timeline_id"] == str(timeline_id) + for timeline in ps_http.timeline_list(tenant_id=tenant_id) + ) + + def child_offloaded(): + ps_http.timeline_offload(tenant_id=tenant_id, timeline_id=child_timeline_id) + assert timeline_offloaded_api(child_timeline_id) + + wait_until(child_offloaded) + + assert timeline_offloaded_api(child_timeline_id) + assert not timeline_offloaded_api(root_timeline_id) + + # Reboot the pageserver a bunch of times, do unoffloads, offloads + for i in range(5): + env.pageserver.stop() + env.pageserver.start() + + assert timeline_offloaded_api(child_timeline_id) + assert not timeline_offloaded_api(root_timeline_id) + + ps_http.timeline_archival_config( + tenant_id, + child_timeline_id, + state=TimelineArchivalState.UNARCHIVED, + ) + + assert not timeline_offloaded_api(child_timeline_id) + + if i % 2 == 0: + with env.endpoints.create_start( + "test_archived_branch_persisted", tenant_id=tenant_id + ) as endpoint: + sum_again = endpoint.safe_psql("SELECT sum(key) from foo where key % 3 = 2") + assert sum == sum_again + + ps_http.timeline_archival_config( + tenant_id, + child_timeline_id, + state=TimelineArchivalState.ARCHIVED, + ) + wait_until(child_offloaded) + + # + # Now ensure that scrubber runs will clean up old generations' manifests. + # + + # Sleep some amount larger than min_age_secs + time.sleep(3) + + # Ensure that min_age_secs has a deletion impeding effect + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=3600, mode="full") + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] == 0 + assert gc_summary["tenant_manifests_deleted"] == 0 + + gc_summary = env.storage_scrubber.pageserver_physical_gc(min_age_secs=1, mode="full") + assert gc_summary["remote_storage_errors"] == 0 + assert gc_summary["indices_deleted"] > 0 + assert gc_summary["tenant_manifests_deleted"] > 0 From 944c1adc4ce5534b90306b217de3d71b282d07fe Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 3 Dec 2024 23:07:03 +0100 Subject: [PATCH 27/65] tests & benchmarks: unify the way we customize the default tenant config (#9992) Before this PR, some override callbacks used `.default()`, others used `.setdefault()`. As of this PR, all callbacks use `.setdefault()` which I think is least prone to failure. Aligning on a single way will set the right example for future tests that need such customization. The `test_pageserver_getpage_throttle.py` technically is a change in behavior: before, it replaced the `tenant_config` field, now it just configures the throttle. This is what I believe is intended anyway. 
--- test_runner/performance/test_branch_creation.py | 3 +-- test_runner/regress/test_disk_usage_eviction.py | 3 +-- test_runner/regress/test_pageserver_getpage_throttle.py | 3 ++- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/test_runner/performance/test_branch_creation.py b/test_runner/performance/test_branch_creation.py index 3ce27d6cd3eb..cf2212d447f2 100644 --- a/test_runner/performance/test_branch_creation.py +++ b/test_runner/performance/test_branch_creation.py @@ -142,10 +142,9 @@ def test_branch_creation_many(neon_compare: NeonCompare, n_branches: int, shape: # start without gc so we can time compaction with less noise; use shorter # period for compaction so it starts earlier def patch_default_tenant_config(config): - tenant_config = config.get("tenant_config", {}) + tenant_config = config.setdefault("tenant_config", {}) tenant_config["compaction_period"] = "3s" tenant_config["gc_period"] = "0s" - config["tenant_config"] = tenant_config env.pageserver.edit_config_toml(patch_default_tenant_config) env.pageserver.start( diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 05956b5b9378..954db914b9ea 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -62,9 +62,8 @@ def assert_overrides(tenant_id, default_tenant_conf_value): if config_level_override is not None: def set_min_resident_size(config): - tenant_config = config.get("tenant_config", {}) + tenant_config = config.setdefault("tenant_config", {}) tenant_config["min_resident_size_override"] = config_level_override - config["tenant_config"] = tenant_config env.pageserver.edit_config_toml(set_min_resident_size) env.pageserver.stop() diff --git a/test_runner/regress/test_pageserver_getpage_throttle.py b/test_runner/regress/test_pageserver_getpage_throttle.py index 6d0661f068ca..9644ebe3e2be 100644 --- a/test_runner/regress/test_pageserver_getpage_throttle.py +++ b/test_runner/regress/test_pageserver_getpage_throttle.py @@ -183,7 +183,8 @@ def test_throttle_fair_config_is_settable_but_ignored_in_config_toml( """ def set_tenant_config(ps_cfg): - ps_cfg["tenant_config"] = {"timeline_get_throttle": throttle_config_with_field_fair_set} + tenant_config = ps_cfg.setdefault("tenant_config", {}) + tenant_config["timeline_get_throttle"] = throttle_config_with_field_fair_set neon_env_builder.pageserver_config_override = set_tenant_config env = neon_env_builder.init_start() From 023821a80c68531a487d54353640d029d8b354f3 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Tue, 3 Dec 2024 22:46:18 +0000 Subject: [PATCH 28/65] test_page_service_batching: fix non-numeric metrics (#9998) ## Problem ``` 2024-12-03T15:42:46.5978335Z + poetry run python /__w/neon/neon/scripts/ingest_perf_test_result.py --ingest /__w/neon/neon/test_runner/perf-report-local 2024-12-03T15:42:49.5325077Z Traceback (most recent call last): 2024-12-03T15:42:49.5325603Z File "/__w/neon/neon/scripts/ingest_perf_test_result.py", line 165, in 2024-12-03T15:42:49.5326029Z main() 2024-12-03T15:42:49.5326316Z File "/__w/neon/neon/scripts/ingest_perf_test_result.py", line 155, in main 2024-12-03T15:42:49.5326739Z ingested = ingest_perf_test_result(cur, item, recorded_at_timestamp) 2024-12-03T15:42:49.5327488Z ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 2024-12-03T15:42:49.5327914Z File "/__w/neon/neon/scripts/ingest_perf_test_result.py", line 99, in ingest_perf_test_result 2024-12-03T15:42:49.5328321Z 
psycopg2.extras.execute_values( 2024-12-03T15:42:49.5328940Z File "/github/home/.cache/pypoetry/virtualenvs/non-package-mode-_pxWMzVK-py3.11/lib/python3.11/site-packages/psycopg2/extras.py", line 1299, in execute_values 2024-12-03T15:42:49.5335618Z cur.execute(b''.join(parts)) 2024-12-03T15:42:49.5335967Z psycopg2.errors.InvalidTextRepresentation: invalid input syntax for type numeric: "concurrent-futures" 2024-12-03T15:42:49.5336287Z LINE 57: 'concurrent-futures', 2024-12-03T15:42:49.5336462Z ^ ``` ## Summary of changes - `test_page_service_batching`: save non-numeric params as `labels` - Add a runtime check that `metric_value` is NUMERIC --- test_runner/fixtures/benchmark_fixture.py | 10 ++++++++++ .../pageserver/test_page_service_batching.py | 11 ++++------- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py index bb8e75902e77..fa3747c08f29 100644 --- a/test_runner/fixtures/benchmark_fixture.py +++ b/test_runner/fixtures/benchmark_fixture.py @@ -266,6 +266,16 @@ def record( name = f"{self.PROPERTY_PREFIX}_{metric_name}" if labels is None: labels = {} + + # Sometimes mypy can't catch non-numeric values, + # so adding a check here + try: + float(metric_value) + except ValueError as e: + raise ValueError( + f"`metric_value` (`{metric_value}`) must be a NUMERIC-friendly data type" + ) from e + self.property_recorder( name, { diff --git a/test_runner/performance/pageserver/test_page_service_batching.py b/test_runner/performance/pageserver/test_page_service_batching.py index 562094a059d2..2c27368001b3 100644 --- a/test_runner/performance/pageserver/test_page_service_batching.py +++ b/test_runner/performance/pageserver/test_page_service_batching.py @@ -116,21 +116,18 @@ def test_throughput( # name is not a metric, we just use it to identify the test easily in the `test_...[...]`` notation } ) - params.update( - { - f"pipelining_config.{k}": (v, {}) - for k, v in dataclasses.asdict(pipelining_config).items() - } - ) + # For storing configuration as a metric, insert a fake 0 with labels with actual data + params.update({"pipelining_config": (0, {"labels": dataclasses.asdict(pipelining_config)})}) log.info("params: %s", params) for param, (value, kwargs) in params.items(): zenbenchmark.record( param, - metric_value=value, + metric_value=float(value), unit=kwargs.pop("unit", ""), report=MetricReport.TEST_PARAM, + labels=kwargs.pop("labels", None), **kwargs, ) From 8d93d02c2f3215226efbcbd65d71de82e0ade023 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 4 Dec 2024 01:07:49 +0100 Subject: [PATCH 29/65] page_service: enable batching in Rust & Python Tests + Python benchmarks (#9993) This is the first step towards batching rollout. 
Refs - rollout plan: https://github.com/neondatabase/cloud/issues/20620 - task https://github.com/neondatabase/neon/issues/9377 - uber-epic: https://github.com/neondatabase/neon/issues/9376 --- libs/pageserver_api/src/config.rs | 9 ++++++++- test_runner/fixtures/neon_fixtures.py | 11 +++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index e49d15ba87a0..09cfbc55fd1c 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -442,7 +442,14 @@ impl Default for ConfigToml { tenant_config: TenantConfigToml::default(), no_sync: None, wal_receiver_protocol: DEFAULT_WAL_RECEIVER_PROTOCOL, - page_service_pipelining: PageServicePipeliningConfig::Serial, + page_service_pipelining: if !cfg!(test) { + PageServicePipeliningConfig::Serial + } else { + PageServicePipeliningConfig::Pipelined(PageServicePipeliningConfigPipelined { + max_batch_size: NonZeroUsize::new(32).unwrap(), + execution: PageServiceProtocolPipelinedExecutionStrategy::ConcurrentFutures, + }) + }, } } } diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index f55f06bebc00..9c579373e8ea 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1095,6 +1095,17 @@ def __init__(self, config: NeonEnvBuilder): # the pageserver taking a long time to start up due to syncfs flushing other tests' data "no_sync": True, } + + # Batching (https://github.com/neondatabase/neon/issues/9377): + # enable batching by default in tests and benchmarks. + # Compat tests are exempt because old versions fail to parse the new config. + if not config.compatibility_neon_binpath: + ps_cfg["page_service_pipelining"] = { + "mode": "pipelined", + "execution": "concurrent-futures", + "max_batch_size": 32, + } + if self.pageserver_virtual_file_io_engine is not None: ps_cfg["virtual_file_io_engine"] = self.pageserver_virtual_file_io_engine if config.pageserver_default_tenant_config_compaction_algorithm is not None: From 68205c48edab32f2d08523332a5a25af80eb0770 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 4 Dec 2024 09:25:29 +0000 Subject: [PATCH 30/65] storcon: return an error for drain attempts while paused (#9997) ## Problem We currently allow drain operations to proceed while the node policy is paused. ## Summary of changes Return a precondition failed error in such cases. The orchestrator is updated in https://github.com/neondatabase/infra/pull/2544 to skip drain and fills if the pageserver is paused. 
Closes: https://github.com/neondatabase/neon/issues/9907 --- storage_controller/src/service.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 741d3dc2b4da..92ec58cb4d68 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -5681,7 +5681,7 @@ impl Service { } match node_policy { - NodeSchedulingPolicy::Active | NodeSchedulingPolicy::Pause => { + NodeSchedulingPolicy::Active => { self.node_configure(node_id, None, Some(NodeSchedulingPolicy::Draining)) .await?; From 1b3558df7a4fa3b6a44d7e5c5fd4c18fe4cd9acd Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Wed, 4 Dec 2024 12:07:22 +0100 Subject: [PATCH 31/65] optimize parms for ingest bench (#9999) ## Problem we tried different parallelism settings for ingest bench ## Summary of changes the following settings seem optimal after merging - SK side Wal filtering - batched getpages Settings: - effective_io_concurrency 100 - concurrency limit 200 (different from Prod!) - jobs 4, maintenance workers 7 - 10 GB chunk size --- .github/workflows/ingest_benchmark.yml | 1 + .../performance/test_perf_ingest_using_pgcopydb.py | 10 +++++----- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ingest_benchmark.yml b/.github/workflows/ingest_benchmark.yml index 1033dc6489f7..a5810e91a42b 100644 --- a/.github/workflows/ingest_benchmark.yml +++ b/.github/workflows/ingest_benchmark.yml @@ -26,6 +26,7 @@ concurrency: jobs: ingest: strategy: + fail-fast: false # allow other variants to continue even if one fails matrix: target_project: [new_empty_project, large_existing_project] permissions: diff --git a/test_runner/performance/test_perf_ingest_using_pgcopydb.py b/test_runner/performance/test_perf_ingest_using_pgcopydb.py index 37f2e9db5026..2f4574ba8806 100644 --- a/test_runner/performance/test_perf_ingest_using_pgcopydb.py +++ b/test_runner/performance/test_perf_ingest_using_pgcopydb.py @@ -60,13 +60,13 @@ def build_pgcopydb_command(pgcopydb_filter_file: Path, test_output_dir: Path): "--no-acl", "--skip-db-properties", "--table-jobs", - "8", + "4", "--index-jobs", - "8", + "4", "--restore-jobs", - "8", + "4", "--split-tables-larger-than", - "5GB", + "10GB", "--skip-extensions", "--use-copy-binary", "--filters", @@ -136,7 +136,7 @@ def run_command_and_log_output(command, log_file_path: Path): "LD_LIBRARY_PATH": f"{os.getenv('PGCOPYDB_LIB_PATH')}:{os.getenv('PG_16_LIB_PATH')}", "PGCOPYDB_SOURCE_PGURI": cast(str, os.getenv("BENCHMARK_INGEST_SOURCE_CONNSTR")), "PGCOPYDB_TARGET_PGURI": cast(str, os.getenv("BENCHMARK_INGEST_TARGET_CONNSTR")), - "PGOPTIONS": "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=16", + "PGOPTIONS": "-c maintenance_work_mem=8388608 -c max_parallel_maintenance_workers=7", } # Combine the current environment with custom variables env = os.environ.copy() From 9d75218ba7ad6340abfaa9a9bfbbe6fa443841f0 Mon Sep 17 00:00:00 2001 From: Peter Bendel Date: Wed, 4 Dec 2024 12:37:24 +0100 Subject: [PATCH 32/65] fix parsing human time output like "50m37s" (#10001) ## Problem In ingest_benchmark.yml workflow we use pgcopydb tool to migrate project. pgcopydb logs human time. Our parsing of the human time doesn't work for times like "50m37s". 
[Example workflow](https://github.com/neondatabase/neon/actions/runs/12145539948/job/33867418065#step:10:479) contains "57m45s" but we [reported](https://github.com/neondatabase/neon/actions/runs/12145539948/job/33867418065#step:10:500) only the seconds part: 45.000 s ## Summary of changes add a regex pattern for Minute/Second combination --- test_runner/performance/test_perf_ingest_using_pgcopydb.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/performance/test_perf_ingest_using_pgcopydb.py b/test_runner/performance/test_perf_ingest_using_pgcopydb.py index 2f4574ba8806..f0a0c1f5a251 100644 --- a/test_runner/performance/test_perf_ingest_using_pgcopydb.py +++ b/test_runner/performance/test_perf_ingest_using_pgcopydb.py @@ -184,7 +184,7 @@ def parse_log_and_report_metrics( for metric_name, pattern in metric_patterns.items(): if pattern.search(line): # Extract duration and convert it to seconds - duration_match = re.search(r"\d+h\d+m|\d+s|\d+ms|\d+\.\d+s", line) + duration_match = re.search(r"\d+h\d+m|\d+m\d+s|\d+s|\d+ms|\d+\.\d+s", line) if duration_match: duration_str = duration_match.group(0) parts = re.findall(r"\d+[a-zA-Z]+", duration_str) From 7b18e33997b861ce92bce9192007f87ab45708a9 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 4 Dec 2024 13:53:52 +0100 Subject: [PATCH 33/65] pageserver: return proper status code for heatmap_upload errors (#9991) ## Problem During deploys, we see a lot of 500 errors due to heapmap uploads for inactive tenants. These should be 503s instead. Resolves #9574. ## Summary of changes Make the secondary tenant scheduler use `ApiError` rather than `anyhow::Error`, to propagate the tenant error and convert it to an appropriate status code. --- pageserver/src/http/routes.rs | 28 ++++++++++++---- pageserver/src/tenant.rs | 2 +- pageserver/src/tenant/mgr.rs | 5 ++- pageserver/src/tenant/secondary.rs | 33 +++++++++++++++---- pageserver/src/tenant/secondary/downloader.rs | 13 ++++---- .../src/tenant/secondary/heatmap_uploader.rs | 10 +++--- pageserver/src/tenant/secondary/scheduler.rs | 4 +-- 7 files changed, 68 insertions(+), 27 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index e127871549ea..e04f1460a8f2 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -279,7 +279,10 @@ impl From for ApiError { impl From for ApiError { fn from(tse: GetTenantError) -> ApiError { match tse { - GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {}", tid).into()), + GetTenantError::NotFound(tid) => ApiError::NotFound(anyhow!("tenant {tid}").into()), + GetTenantError::ShardNotFound(tid) => { + ApiError::NotFound(anyhow!("tenant {tid}").into()) + } GetTenantError::NotActive(_) => { // Why is this not `ApiError::NotFound`? 
// Because we must be careful to never return 404 for a tenant if it does @@ -387,6 +390,16 @@ impl From for ApiError { } } +impl From for ApiError { + fn from(ste: crate::tenant::secondary::SecondaryTenantError) -> ApiError { + use crate::tenant::secondary::SecondaryTenantError; + match ste { + SecondaryTenantError::GetTenant(gte) => gte.into(), + SecondaryTenantError::ShuttingDown => ApiError::ShuttingDown, + } + } +} + // Helper function to construct a TimelineInfo struct for a timeline async fn build_timeline_info( timeline: &Arc, @@ -1047,9 +1060,11 @@ async fn timeline_delete_handler( match e { // GetTenantError has a built-in conversion to ApiError, but in this context we don't // want to treat missing tenants as 404, to avoid ambiguity with successful deletions. - GetTenantError::NotFound(_) => ApiError::PreconditionFailed( - "Requested tenant is missing".to_string().into_boxed_str(), - ), + GetTenantError::NotFound(_) | GetTenantError::ShardNotFound(_) => { + ApiError::PreconditionFailed( + "Requested tenant is missing".to_string().into_boxed_str(), + ) + } e => e.into(), } })?; @@ -2462,8 +2477,7 @@ async fn secondary_upload_handler( state .secondary_controller .upload_tenant(tenant_shard_id) - .await - .map_err(ApiError::InternalServerError)?; + .await?; json_response(StatusCode::OK, ()) } @@ -2578,7 +2592,7 @@ async fn secondary_download_handler( // Edge case: downloads aren't usually fallible: things like a missing heatmap are considered // okay. We could get an error here in the unlikely edge case that the tenant // was detached between our check above and executing the download job. - Ok(Err(e)) => return Err(ApiError::InternalServerError(e)), + Ok(Err(e)) => return Err(e.into()), // A timeout is not an error: we have started the download, we're just not done // yet. The caller will get a response body indicating status. Err(_) => StatusCode::ACCEPTED, diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index ada5c4a97705..5a9e398586f6 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -3422,7 +3422,7 @@ impl Tenant { r.map_err( |_e: tokio::sync::watch::error::RecvError| // Tenant existed but was dropped: report it as non-existent - GetActiveTenantError::NotFound(GetTenantError::NotFound(self.tenant_shard_id.tenant_id)) + GetActiveTenantError::NotFound(GetTenantError::ShardNotFound(self.tenant_shard_id)) )? 
} Err(TimeoutCancellableError::Cancelled) => { diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index 45481c4ed44e..e8b0d1d4dd64 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -894,7 +894,7 @@ impl TenantManager { Some(TenantSlot::Attached(tenant)) => Ok(Arc::clone(tenant)), Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)), None | Some(TenantSlot::Secondary(_)) => { - Err(GetTenantError::NotFound(tenant_shard_id.tenant_id)) + Err(GetTenantError::ShardNotFound(tenant_shard_id)) } } } @@ -2258,6 +2258,9 @@ pub(crate) enum GetTenantError { #[error("Tenant {0} not found")] NotFound(TenantId), + #[error("Tenant {0} not found")] + ShardNotFound(TenantShardId), + #[error("Tenant {0} is not active")] NotActive(TenantShardId), diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 3df89a928cb2..4bc208331b35 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -22,6 +22,7 @@ use super::{ mgr::TenantManager, span::debug_assert_current_span_has_tenant_id, storage_layer::LayerName, + GetTenantError, }; use crate::metrics::SECONDARY_RESIDENT_PHYSICAL_SIZE; @@ -66,7 +67,21 @@ struct CommandRequest { } struct CommandResponse { - result: anyhow::Result<()>, + result: Result<(), SecondaryTenantError>, +} + +#[derive(thiserror::Error, Debug)] +pub(crate) enum SecondaryTenantError { + #[error("{0}")] + GetTenant(GetTenantError), + #[error("shutting down")] + ShuttingDown, +} + +impl From for SecondaryTenantError { + fn from(gte: GetTenantError) -> Self { + Self::GetTenant(gte) + } } // Whereas [`Tenant`] represents an attached tenant, this type represents the work @@ -285,7 +300,7 @@ impl SecondaryController { &self, queue: &tokio::sync::mpsc::Sender>, payload: T, - ) -> anyhow::Result<()> { + ) -> Result<(), SecondaryTenantError> { let (response_tx, response_rx) = tokio::sync::oneshot::channel(); queue @@ -294,20 +309,26 @@ impl SecondaryController { response_tx, }) .await - .map_err(|_| anyhow::anyhow!("Receiver shut down"))?; + .map_err(|_| SecondaryTenantError::ShuttingDown)?; let response = response_rx .await - .map_err(|_| anyhow::anyhow!("Request dropped"))?; + .map_err(|_| SecondaryTenantError::ShuttingDown)?; response.result } - pub async fn upload_tenant(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { + pub(crate) async fn upload_tenant( + &self, + tenant_shard_id: TenantShardId, + ) -> Result<(), SecondaryTenantError> { self.dispatch(&self.upload_req_tx, UploadCommand::Upload(tenant_shard_id)) .await } - pub async fn download_tenant(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> { + pub(crate) async fn download_tenant( + &self, + tenant_shard_id: TenantShardId, + ) -> Result<(), SecondaryTenantError> { self.dispatch( &self.download_req_tx, DownloadCommand::Download(tenant_shard_id), diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 8d771dc40535..701e4cf04b49 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -35,7 +35,7 @@ use super::{ self, period_jitter, period_warmup, Completion, JobGenerator, SchedulingResult, TenantBackgroundJobs, }, - SecondaryTenant, + GetTenantError, SecondaryTenant, SecondaryTenantError, }; use crate::tenant::{ @@ -470,15 +470,16 @@ impl JobGenerator anyhow::Result { + fn on_command( + &mut self, + command: DownloadCommand, + ) -> Result { let 
tenant_shard_id = command.get_tenant_shard_id(); let tenant = self .tenant_manager - .get_secondary_tenant_shard(*tenant_shard_id); - let Some(tenant) = tenant else { - return Err(anyhow::anyhow!("Not found or not in Secondary mode")); - }; + .get_secondary_tenant_shard(*tenant_shard_id) + .ok_or(GetTenantError::ShardNotFound(*tenant_shard_id))?; Ok(PendingDownload { target_time: None, diff --git a/pageserver/src/tenant/secondary/heatmap_uploader.rs b/pageserver/src/tenant/secondary/heatmap_uploader.rs index e680fd705b42..c5e5e0494571 100644 --- a/pageserver/src/tenant/secondary/heatmap_uploader.rs +++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs @@ -28,7 +28,7 @@ use super::{ self, period_jitter, period_warmup, JobGenerator, RunningJob, SchedulingResult, TenantBackgroundJobs, }, - CommandRequest, UploadCommand, + CommandRequest, SecondaryTenantError, UploadCommand, }; use tokio_util::sync::CancellationToken; use tracing::{info_span, instrument, Instrument}; @@ -279,7 +279,10 @@ impl JobGenerator }.instrument(info_span!(parent: None, "heatmap_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug())))) } - fn on_command(&mut self, command: UploadCommand) -> anyhow::Result { + fn on_command( + &mut self, + command: UploadCommand, + ) -> Result { let tenant_shard_id = command.get_tenant_shard_id(); tracing::info!( @@ -287,8 +290,7 @@ impl JobGenerator "Starting heatmap write on command"); let tenant = self .tenant_manager - .get_attached_tenant_shard(*tenant_shard_id) - .map_err(|e| anyhow::anyhow!(e))?; + .get_attached_tenant_shard(*tenant_shard_id)?; if !tenant.is_active() { return Err(GetTenantError::NotActive(*tenant_shard_id).into()); } diff --git a/pageserver/src/tenant/secondary/scheduler.rs b/pageserver/src/tenant/secondary/scheduler.rs index 28cf2125dfd0..e963c722b97a 100644 --- a/pageserver/src/tenant/secondary/scheduler.rs +++ b/pageserver/src/tenant/secondary/scheduler.rs @@ -12,7 +12,7 @@ use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use utils::{completion::Barrier, yielding_loop::yielding_loop}; -use super::{CommandRequest, CommandResponse}; +use super::{CommandRequest, CommandResponse, SecondaryTenantError}; /// Scheduling interval is the time between calls to JobGenerator::schedule. /// When we schedule jobs, the job generator may provide a hint of its preferred @@ -112,7 +112,7 @@ where /// Called when a command is received. A job will be spawned immediately if the return /// value is Some, ignoring concurrency limits and the pending queue. 
- fn on_command(&mut self, cmd: CMD) -> anyhow::Result; + fn on_command(&mut self, cmd: CMD) -> Result; } /// [`JobGenerator`] returns this to provide pending jobs, and hints about scheduling From dcd016bbfc10666e10fcd9f9f2bce93a2ec2f1f9 Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Wed, 4 Dec 2024 13:58:31 +0100 Subject: [PATCH 34/65] Assign /libs/proxy/ to proxy team (#10003) --- CODEOWNERS | 1 + 1 file changed, 1 insertion(+) diff --git a/CODEOWNERS b/CODEOWNERS index 21b0e7c51f0f..f41462c98b1c 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -2,6 +2,7 @@ /compute_tools/ @neondatabase/control-plane @neondatabase/compute /libs/pageserver_api/ @neondatabase/storage /libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage +/libs/proxy/ @neondatabase/proxy /libs/remote_storage/ @neondatabase/storage /libs/safekeeper_api/ @neondatabase/storage /libs/vm_monitor/ @neondatabase/autoscaling From bd52822e14cd1c62ad4f39ac599964a645c4aa32 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 4 Dec 2024 12:58:35 +0000 Subject: [PATCH 35/65] feat(proxy): add option to forward startup params (#9979) (stacked on #9990 and #9995) Partially fixes #1287 with a custom option field to enable the fixed behaviour. This allows us to gradually roll out the fix without silently changing the observed behaviour for our customers. related to https://github.com/neondatabase/cloud/issues/15284 --- Cargo.lock | 4 +- Cargo.toml | 2 +- libs/pq_proto/src/lib.rs | 2 +- .../src/authentication/sasl.rs | 4 +- .../src/message/frontend.rs | 28 ++- libs/proxy/tokio-postgres2/src/codec.rs | 13 +- libs/proxy/tokio-postgres2/src/config.rs | 191 +++--------------- libs/proxy/tokio-postgres2/src/connect.rs | 49 +---- libs/proxy/tokio-postgres2/src/connect_raw.rs | 31 +-- proxy/src/cancellation.rs | 11 +- proxy/src/compute.rs | 96 ++++----- proxy/src/console_redirect_proxy.rs | 1 + proxy/src/proxy/connect_compute.rs | 4 +- proxy/src/proxy/mod.rs | 40 +++- proxy/src/proxy/tests/mitm.rs | 8 +- proxy/src/proxy/tests/mod.rs | 2 +- proxy/src/serverless/backend.rs | 11 +- test_runner/fixtures/neon_fixtures.py | 2 +- test_runner/regress/test_proxy.py | 19 ++ 19 files changed, 179 insertions(+), 339 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5b80ec5e93d8..38158b7aec0c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1031,9 +1031,9 @@ checksum = "14c189c53d098945499cdfa7ecc63567cf3886b3332b312a5b4585d8d3a6a610" [[package]] name = "bytes" -version = "1.5.0" +version = "1.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a2bd12c1caf447e69cd4528f47f94d203fd2582878ecb9e9465484c4148a8223" +checksum = "325918d6fe32f23b19878fe4b34794ae41fc19ddbe53b10571a4874d44ffd39b" dependencies = [ "serde", ] diff --git a/Cargo.toml b/Cargo.toml index 91fa6a260768..a35823e0c2c4 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -74,7 +74,7 @@ bindgen = "0.70" bit_field = "0.10.2" bstr = "1.0" byteorder = "1.4" -bytes = "1.0" +bytes = "1.9" camino = "1.1.6" cfg-if = "1.0.0" chrono = { version = "0.4", default-features = false, features = ["clock"] } diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 43dfbc22a45a..94714359a3d8 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -100,7 +100,7 @@ impl StartupMessageParamsBuilder { #[derive(Debug, Clone, Default)] pub struct StartupMessageParams { - params: Bytes, + pub params: Bytes, } impl StartupMessageParams { diff --git a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs 
b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs index 19aa3c1e9aaa..f2200a40ce59 100644 --- a/libs/proxy/postgres-protocol2/src/authentication/sasl.rs +++ b/libs/proxy/postgres-protocol2/src/authentication/sasl.rs @@ -117,7 +117,7 @@ enum Credentials { /// A regular password as a vector of bytes. Password(Vec), /// A precomputed pair of keys. - Keys(Box>), + Keys(ScramKeys), } enum State { @@ -176,7 +176,7 @@ impl ScramSha256 { /// Constructs a new instance which will use the provided key pair for authentication. pub fn new_with_keys(keys: ScramKeys<32>, channel_binding: ChannelBinding) -> ScramSha256 { - let password = Credentials::Keys(keys.into()); + let password = Credentials::Keys(keys); ScramSha256::new_inner(password, channel_binding, nonce()) } diff --git a/libs/proxy/postgres-protocol2/src/message/frontend.rs b/libs/proxy/postgres-protocol2/src/message/frontend.rs index 5d0a8ff8c838..bc6168f33732 100644 --- a/libs/proxy/postgres-protocol2/src/message/frontend.rs +++ b/libs/proxy/postgres-protocol2/src/message/frontend.rs @@ -255,22 +255,34 @@ pub fn ssl_request(buf: &mut BytesMut) { } #[inline] -pub fn startup_message<'a, I>(parameters: I, buf: &mut BytesMut) -> io::Result<()> -where - I: IntoIterator, -{ +pub fn startup_message(parameters: &StartupMessageParams, buf: &mut BytesMut) -> io::Result<()> { write_body(buf, |buf| { // postgres protocol version 3.0(196608) in bigger-endian buf.put_i32(0x00_03_00_00); - for (key, value) in parameters { - write_cstr(key.as_bytes(), buf)?; - write_cstr(value.as_bytes(), buf)?; - } + buf.put_slice(¶meters.params); buf.put_u8(0); Ok(()) }) } +#[derive(Debug, Clone, Default, PartialEq, Eq)] +pub struct StartupMessageParams { + pub params: BytesMut, +} + +impl StartupMessageParams { + /// Set parameter's value by its name. 
+ pub fn insert(&mut self, name: &str, value: &str) { + if name.contains('\0') || value.contains('\0') { + panic!("startup parameter name or value contained a null") + } + self.params.put_slice(name.as_bytes()); + self.params.put_u8(0); + self.params.put_slice(value.as_bytes()); + self.params.put_u8(0); + } +} + #[inline] pub fn sync(buf: &mut BytesMut) { buf.put_u8(b'S'); diff --git a/libs/proxy/tokio-postgres2/src/codec.rs b/libs/proxy/tokio-postgres2/src/codec.rs index 7412db785b0a..0ec46198ce42 100644 --- a/libs/proxy/tokio-postgres2/src/codec.rs +++ b/libs/proxy/tokio-postgres2/src/codec.rs @@ -35,9 +35,7 @@ impl FallibleIterator for BackendMessages { } } -pub struct PostgresCodec { - pub max_message_size: Option, -} +pub struct PostgresCodec; impl Encoder for PostgresCodec { type Error = io::Error; @@ -66,15 +64,6 @@ impl Decoder for PostgresCodec { break; } - if let Some(max) = self.max_message_size { - if len > max { - return Err(io::Error::new( - io::ErrorKind::InvalidInput, - "message too large", - )); - } - } - match header.tag() { backend::NOTICE_RESPONSE_TAG | backend::NOTIFICATION_RESPONSE_TAG diff --git a/libs/proxy/tokio-postgres2/src/config.rs b/libs/proxy/tokio-postgres2/src/config.rs index fd10ef6f207d..11a361a81b66 100644 --- a/libs/proxy/tokio-postgres2/src/config.rs +++ b/libs/proxy/tokio-postgres2/src/config.rs @@ -6,6 +6,7 @@ use crate::connect_raw::RawConnection; use crate::tls::MakeTlsConnect; use crate::tls::TlsConnect; use crate::{Client, Connection, Error}; +use postgres_protocol2::message::frontend::StartupMessageParams; use std::fmt; use std::str; use std::time::Duration; @@ -14,16 +15,6 @@ use tokio::io::{AsyncRead, AsyncWrite}; pub use postgres_protocol2::authentication::sasl::ScramKeys; use tokio::net::TcpStream; -/// Properties required of a session. -#[derive(Debug, Copy, Clone, PartialEq, Eq)] -#[non_exhaustive] -pub enum TargetSessionAttrs { - /// No special properties are required. - Any, - /// The session must allow writes. - ReadWrite, -} - /// TLS configuration. #[derive(Debug, Copy, Clone, PartialEq, Eq)] #[non_exhaustive] @@ -73,94 +64,20 @@ pub enum AuthKeys { } /// Connection configuration. -/// -/// Configuration can be parsed from libpq-style connection strings. These strings come in two formats: -/// -/// # Key-Value -/// -/// This format consists of space-separated key-value pairs. Values which are either the empty string or contain -/// whitespace should be wrapped in `'`. `'` and `\` characters should be backslash-escaped. -/// -/// ## Keys -/// -/// * `user` - The username to authenticate with. Required. -/// * `password` - The password to authenticate with. -/// * `dbname` - The name of the database to connect to. Defaults to the username. -/// * `options` - Command line options used to configure the server. -/// * `application_name` - Sets the `application_name` parameter on the server. -/// * `sslmode` - Controls usage of TLS. If set to `disable`, TLS will not be used. If set to `prefer`, TLS will be used -/// if available, but not used otherwise. If set to `require`, TLS will be forced to be used. Defaults to `prefer`. -/// * `host` - The host to connect to. On Unix platforms, if the host starts with a `/` character it is treated as the -/// path to the directory containing Unix domain sockets. Otherwise, it is treated as a hostname. Multiple hosts -/// can be specified, separated by commas. Each host will be tried in turn when connecting. Required if connecting -/// with the `connect` method. -/// * `port` - The port to connect to. 
Multiple ports can be specified, separated by commas. The number of ports must be -/// either 1, in which case it will be used for all hosts, or the same as the number of hosts. Defaults to 5432 if -/// omitted or the empty string. -/// * `connect_timeout` - The time limit in seconds applied to each socket-level connection attempt. Note that hostnames -/// can resolve to multiple IP addresses, and this limit is applied to each address. Defaults to no timeout. -/// * `target_session_attrs` - Specifies requirements of the session. If set to `read-write`, the client will check that -/// the `transaction_read_write` session parameter is set to `on`. This can be used to connect to the primary server -/// in a database cluster as opposed to the secondary read-only mirrors. Defaults to `all`. -/// * `channel_binding` - Controls usage of channel binding in the authentication process. If set to `disable`, channel -/// binding will not be used. If set to `prefer`, channel binding will be used if available, but not used otherwise. -/// If set to `require`, the authentication process will fail if channel binding is not used. Defaults to `prefer`. -/// -/// ## Examples -/// -/// ```not_rust -/// host=localhost user=postgres connect_timeout=10 keepalives=0 -/// ``` -/// -/// ```not_rust -/// host=/var/lib/postgresql,localhost port=1234 user=postgres password='password with spaces' -/// ``` -/// -/// ```not_rust -/// host=host1,host2,host3 port=1234,,5678 user=postgres target_session_attrs=read-write -/// ``` -/// -/// # Url -/// -/// This format resembles a URL with a scheme of either `postgres://` or `postgresql://`. All components are optional, -/// and the format accepts query parameters for all of the key-value pairs described in the section above. Multiple -/// host/port pairs can be comma-separated. Unix socket paths in the host section of the URL should be percent-encoded, -/// as the path component of the URL specifies the database name. -/// -/// ## Examples -/// -/// ```not_rust -/// postgresql://user@localhost -/// ``` -/// -/// ```not_rust -/// postgresql://user:password@%2Fvar%2Flib%2Fpostgresql/mydb?connect_timeout=10 -/// ``` -/// -/// ```not_rust -/// postgresql://user@host1:1234,host2,host3:5678?target_session_attrs=read-write -/// ``` -/// -/// ```not_rust -/// postgresql:///mydb?user=user&host=/var/lib/postgresql -/// ``` #[derive(Clone, PartialEq, Eq)] pub struct Config { pub(crate) host: Host, pub(crate) port: u16, - pub(crate) user: Option, pub(crate) password: Option>, pub(crate) auth_keys: Option>, - pub(crate) dbname: Option, - pub(crate) options: Option, - pub(crate) application_name: Option, pub(crate) ssl_mode: SslMode, pub(crate) connect_timeout: Option, - pub(crate) target_session_attrs: TargetSessionAttrs, pub(crate) channel_binding: ChannelBinding, - pub(crate) replication_mode: Option, - pub(crate) max_backend_message_size: Option, + pub(crate) server_params: StartupMessageParams, + + database: bool, + username: bool, } impl Config { @@ -169,18 +86,15 @@ impl Config { Config { host: Host::Tcp(host), port, - user: None, password: None, auth_keys: None, - dbname: None, - options: None, - application_name: None, ssl_mode: SslMode::Prefer, connect_timeout: None, - target_session_attrs: TargetSessionAttrs::Any, channel_binding: ChannelBinding::Prefer, - replication_mode: None, - max_backend_message_size: None, + server_params: StartupMessageParams::default(), + + database: false, + username: false, } } @@ -188,14 +102,13 @@ impl Config { /// /// Required. 
pub fn user(&mut self, user: &str) -> &mut Config { - self.user = Some(user.to_string()); - self + self.set_param("user", user) } /// Gets the user to authenticate with, if one has been configured with /// the `user` method. - pub fn get_user(&self) -> Option<&str> { - self.user.as_deref() + pub fn user_is_set(&self) -> bool { + self.username } /// Sets the password to authenticate with. @@ -231,40 +144,26 @@ impl Config { /// /// Defaults to the user. pub fn dbname(&mut self, dbname: &str) -> &mut Config { - self.dbname = Some(dbname.to_string()); - self + self.set_param("database", dbname) } /// Gets the name of the database to connect to, if one has been configured /// with the `dbname` method. - pub fn get_dbname(&self) -> Option<&str> { - self.dbname.as_deref() + pub fn db_is_set(&self) -> bool { + self.database } - /// Sets command line options used to configure the server. - pub fn options(&mut self, options: &str) -> &mut Config { - self.options = Some(options.to_string()); - self - } - - /// Gets the command line options used to configure the server, if the - /// options have been set with the `options` method. - pub fn get_options(&self) -> Option<&str> { - self.options.as_deref() - } + pub fn set_param(&mut self, name: &str, value: &str) -> &mut Config { + if name == "database" { + self.database = true; + } else if name == "user" { + self.username = true; + } - /// Sets the value of the `application_name` runtime parameter. - pub fn application_name(&mut self, application_name: &str) -> &mut Config { - self.application_name = Some(application_name.to_string()); + self.server_params.insert(name, value); self } - /// Gets the value of the `application_name` runtime parameter, if it has - /// been set with the `application_name` method. - pub fn get_application_name(&self) -> Option<&str> { - self.application_name.as_deref() - } - /// Sets the SSL configuration. /// /// Defaults to `prefer`. @@ -303,23 +202,6 @@ impl Config { self.connect_timeout.as_ref() } - /// Sets the requirements of the session. - /// - /// This can be used to connect to the primary server in a clustered database rather than one of the read-only - /// secondary servers. Defaults to `Any`. - pub fn target_session_attrs( - &mut self, - target_session_attrs: TargetSessionAttrs, - ) -> &mut Config { - self.target_session_attrs = target_session_attrs; - self - } - - /// Gets the requirements of the session. - pub fn get_target_session_attrs(&self) -> TargetSessionAttrs { - self.target_session_attrs - } - /// Sets the channel binding behavior. /// /// Defaults to `prefer`. @@ -333,28 +215,6 @@ impl Config { self.channel_binding } - /// Set replication mode. - pub fn replication_mode(&mut self, replication_mode: ReplicationMode) -> &mut Config { - self.replication_mode = Some(replication_mode); - self - } - - /// Get replication mode. - pub fn get_replication_mode(&self) -> Option { - self.replication_mode - } - - /// Set limit for backend messages size. - pub fn max_backend_message_size(&mut self, max_backend_message_size: usize) -> &mut Config { - self.max_backend_message_size = Some(max_backend_message_size); - self - } - - /// Get limit for backend messages size. - pub fn get_max_backend_message_size(&self) -> Option { - self.max_backend_message_size - } - /// Opens a connection to a PostgreSQL database. /// /// Requires the `runtime` Cargo feature (enabled by default). 
@@ -392,18 +252,13 @@ impl fmt::Debug for Config { } f.debug_struct("Config") - .field("user", &self.user) .field("password", &self.password.as_ref().map(|_| Redaction {})) - .field("dbname", &self.dbname) - .field("options", &self.options) - .field("application_name", &self.application_name) .field("ssl_mode", &self.ssl_mode) .field("host", &self.host) .field("port", &self.port) .field("connect_timeout", &self.connect_timeout) - .field("target_session_attrs", &self.target_session_attrs) .field("channel_binding", &self.channel_binding) - .field("replication", &self.replication_mode) + .field("server_params", &self.server_params) .finish() } } diff --git a/libs/proxy/tokio-postgres2/src/connect.rs b/libs/proxy/tokio-postgres2/src/connect.rs index 75a58e6eacc9..e0cb69748d50 100644 --- a/libs/proxy/tokio-postgres2/src/connect.rs +++ b/libs/proxy/tokio-postgres2/src/connect.rs @@ -1,14 +1,11 @@ use crate::client::SocketConfig; use crate::codec::BackendMessage; -use crate::config::{Host, TargetSessionAttrs}; +use crate::config::Host; use crate::connect_raw::connect_raw; use crate::connect_socket::connect_socket; use crate::tls::{MakeTlsConnect, TlsConnect}; -use crate::{Client, Config, Connection, Error, RawConnection, SimpleQueryMessage}; -use futures_util::{future, pin_mut, Future, FutureExt, Stream}; +use crate::{Client, Config, Connection, Error, RawConnection}; use postgres_protocol2::message::backend::Message; -use std::io; -use std::task::Poll; use tokio::net::TcpStream; use tokio::sync::mpsc; @@ -72,47 +69,7 @@ where .map(|m| BackendMessage::Async(Message::NoticeResponse(m))) .collect(); - let mut connection = Connection::new(stream, delayed, parameters, receiver); - - if let TargetSessionAttrs::ReadWrite = config.target_session_attrs { - let rows = client.simple_query_raw("SHOW transaction_read_only"); - pin_mut!(rows); - - let rows = future::poll_fn(|cx| { - if connection.poll_unpin(cx)?.is_ready() { - return Poll::Ready(Err(Error::closed())); - } - - rows.as_mut().poll(cx) - }) - .await?; - pin_mut!(rows); - - loop { - let next = future::poll_fn(|cx| { - if connection.poll_unpin(cx)?.is_ready() { - return Poll::Ready(Some(Err(Error::closed()))); - } - - rows.as_mut().poll_next(cx) - }); - - match next.await.transpose()? { - Some(SimpleQueryMessage::Row(row)) => { - if row.try_get(0)? 
== Some("on") { - return Err(Error::connect(io::Error::new( - io::ErrorKind::PermissionDenied, - "database does not allow writes", - ))); - } else { - break; - } - } - Some(_) => {} - None => return Err(Error::unexpected_message()), - } - } - } + let connection = Connection::new(stream, delayed, parameters, receiver); Ok((client, connection)) } diff --git a/libs/proxy/tokio-postgres2/src/connect_raw.rs b/libs/proxy/tokio-postgres2/src/connect_raw.rs index 390f133002be..66db85e07d24 100644 --- a/libs/proxy/tokio-postgres2/src/connect_raw.rs +++ b/libs/proxy/tokio-postgres2/src/connect_raw.rs @@ -1,5 +1,5 @@ use crate::codec::{BackendMessage, BackendMessages, FrontendMessage, PostgresCodec}; -use crate::config::{self, AuthKeys, Config, ReplicationMode}; +use crate::config::{self, AuthKeys, Config}; use crate::connect_tls::connect_tls; use crate::maybe_tls_stream::MaybeTlsStream; use crate::tls::{TlsConnect, TlsStream}; @@ -96,12 +96,7 @@ where let stream = connect_tls(stream, config.ssl_mode, tls).await?; let mut stream = StartupStream { - inner: Framed::new( - stream, - PostgresCodec { - max_message_size: config.max_backend_message_size, - }, - ), + inner: Framed::new(stream, PostgresCodec), buf: BackendMessages::empty(), delayed_notice: Vec::new(), }; @@ -124,28 +119,8 @@ where S: AsyncRead + AsyncWrite + Unpin, T: AsyncRead + AsyncWrite + Unpin, { - let mut params = vec![("client_encoding", "UTF8")]; - if let Some(user) = &config.user { - params.push(("user", &**user)); - } - if let Some(dbname) = &config.dbname { - params.push(("database", &**dbname)); - } - if let Some(options) = &config.options { - params.push(("options", &**options)); - } - if let Some(application_name) = &config.application_name { - params.push(("application_name", &**application_name)); - } - if let Some(replication_mode) = &config.replication_mode { - match replication_mode { - ReplicationMode::Physical => params.push(("replication", "true")), - ReplicationMode::Logical => params.push(("replication", "database")), - } - } - let mut buf = BytesMut::new(); - frontend::startup_message(params, &mut buf).map_err(Error::encode)?; + frontend::startup_message(&config.server_params, &mut buf).map_err(Error::encode)?; stream .send(FrontendMessage::Raw(buf.freeze())) diff --git a/proxy/src/cancellation.rs b/proxy/src/cancellation.rs index bcb0ef40bd74..7bc5587a2535 100644 --- a/proxy/src/cancellation.rs +++ b/proxy/src/cancellation.rs @@ -70,11 +70,12 @@ impl ReportableError for CancelError { impl CancellationHandler

{
    /// Run async action within an ephemeral session identified by [`CancelKeyData`].
    pub(crate) fn get_session(self: Arc) -> Session

{ - // HACK: We'd rather get the real backend_pid but postgres_client doesn't - // expose it and we don't want to do another roundtrip to query - // for it. The client will be able to notice that this is not the - // actual backend_pid, but backend_pid is not used for anything - // so it doesn't matter. + // we intentionally generate a random "backend pid" and "secret key" here. + // we use the corresponding u64 as an identifier for the + // actual endpoint+pid+secret for postgres/pgbouncer. + // + // if we forwarded the backend_pid from postgres to the client, there would be a lot + // of overlap between our computes as most pids are small (~100). let key = loop { let key = rand::random(); diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs index ab0ff4b7950a..4113b5bb80e3 100644 --- a/proxy/src/compute.rs +++ b/proxy/src/compute.rs @@ -131,49 +131,37 @@ impl ConnCfg { } /// Apply startup message params to the connection config. - pub(crate) fn set_startup_params(&mut self, params: &StartupMessageParams) { - // Only set `user` if it's not present in the config. - // Console redirect auth flow takes username from the console's response. - if let (None, Some(user)) = (self.get_user(), params.get("user")) { - self.user(user); + pub(crate) fn set_startup_params( + &mut self, + params: &StartupMessageParams, + arbitrary_params: bool, + ) { + if !arbitrary_params { + self.set_param("client_encoding", "UTF8"); } - - // Only set `dbname` if it's not present in the config. - // Console redirect auth flow takes dbname from the console's response. - if let (None, Some(dbname)) = (self.get_dbname(), params.get("database")) { - self.dbname(dbname); - } - - // Don't add `options` if they were only used for specifying a project. - // Connection pools don't support `options`, because they affect backend startup. - if let Some(options) = filtered_options(params) { - self.options(&options); - } - - if let Some(app_name) = params.get("application_name") { - self.application_name(app_name); - } - - // TODO: This is especially ugly... - if let Some(replication) = params.get("replication") { - use postgres_client::config::ReplicationMode; - match replication { - "true" | "on" | "yes" | "1" => { - self.replication_mode(ReplicationMode::Physical); + for (k, v) in params.iter() { + match k { + // Only set `user` if it's not present in the config. + // Console redirect auth flow takes username from the console's response. + "user" if self.user_is_set() => continue, + "database" if self.db_is_set() => continue, + "options" => { + if let Some(options) = filtered_options(v) { + self.set_param(k, &options); + } } - "database" => { - self.replication_mode(ReplicationMode::Logical); + "user" | "database" | "application_name" | "replication" => { + self.set_param(k, v); } - _other => {} + + // if we allow arbitrary params, then we forward them through. + // this is a flag for a period of backwards compatibility + k if arbitrary_params => { + self.set_param(k, v); + } + _ => {} } } - - // TODO: extend the list of the forwarded startup parameters. - // Currently, tokio-postgres doesn't allow us to pass - // arbitrary parameters, but the ones above are a good start. - // - // This and the reverse params problem can be better addressed - // in a bespoke connection machinery (a new library for that sake). } } @@ -347,10 +335,9 @@ impl ConnCfg { } /// Retrieve `options` from a startup message, dropping all proxy-secific flags. 
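To make the control flow above easier to follow, here is a rough, self-contained sketch of the allow-list-plus-compat-flag rule that `set_startup_params` now applies. It is illustrative only: the real code works on `StartupMessageParams` and `ConnCfg::set_param`, skips `user`/`database` when the config already has them, and runs `options` through `filtered_options` (shown next); the names `KNOWN_PARAMS` and `forward_params` are invented for this sketch.

```
// Hypothetical sketch of the forwarding rule; not the proxy's actual types.
const KNOWN_PARAMS: &[&str] = &["user", "database", "application_name", "replication"];

fn forward_params<'a>(
    params: &'a [(&'a str, &'a str)],
    arbitrary_params: bool,
) -> Vec<(&'a str, &'a str)> {
    let mut out = Vec::new();
    if !arbitrary_params {
        // Pin the client encoding unless the client opted in to full forwarding.
        out.push(("client_encoding", "UTF8"));
    }
    for &(k, v) in params {
        if KNOWN_PARAMS.iter().any(|&p| p == k) || arbitrary_params {
            out.push((k, v));
        }
        // Anything else is silently dropped, mirroring the `_ => {}` arm above.
    }
    out
}

fn main() {
    let client = [("user", "alice"), ("IntervalStyle", "iso_8601")];
    let strict = forward_params(&client, false);
    assert!(strict.iter().any(|&(k, _)| k == "client_encoding"));
    assert!(!strict.iter().any(|&(k, _)| k == "IntervalStyle"));
    let compat = forward_params(&client, true);
    assert!(compat.iter().any(|&(k, _)| k == "IntervalStyle"));
    println!("strict: {strict:?}\ncompat: {compat:?}");
}
```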
-fn filtered_options(params: &StartupMessageParams) -> Option { +fn filtered_options(options: &str) -> Option { #[allow(unstable_name_collisions)] - let options: String = params - .options_raw()? + let options: String = StartupMessageParams::parse_options_raw(options) .filter(|opt| parse_endpoint_param(opt).is_none() && neon_option(opt).is_none()) .intersperse(" ") // TODO: use impl from std once it's stabilized .collect(); @@ -427,27 +414,24 @@ mod tests { #[test] fn test_filtered_options() { // Empty options is unlikely to be useful anyway. - let params = StartupMessageParams::new([("options", "")]); - assert_eq!(filtered_options(¶ms), None); + let params = ""; + assert_eq!(filtered_options(params), None); // It's likely that clients will only use options to specify endpoint/project. - let params = StartupMessageParams::new([("options", "project=foo")]); - assert_eq!(filtered_options(¶ms), None); + let params = "project=foo"; + assert_eq!(filtered_options(params), None); // Same, because unescaped whitespaces are no-op. - let params = StartupMessageParams::new([("options", " project=foo ")]); - assert_eq!(filtered_options(¶ms).as_deref(), None); + let params = " project=foo "; + assert_eq!(filtered_options(params).as_deref(), None); - let params = StartupMessageParams::new([("options", r"\ project=foo \ ")]); - assert_eq!(filtered_options(¶ms).as_deref(), Some(r"\ \ ")); + let params = r"\ project=foo \ "; + assert_eq!(filtered_options(params).as_deref(), Some(r"\ \ ")); - let params = StartupMessageParams::new([("options", "project = foo")]); - assert_eq!(filtered_options(¶ms).as_deref(), Some("project = foo")); + let params = "project = foo"; + assert_eq!(filtered_options(params).as_deref(), Some("project = foo")); - let params = StartupMessageParams::new([( - "options", - "project = foo neon_endpoint_type:read_write neon_lsn:0/2", - )]); - assert_eq!(filtered_options(¶ms).as_deref(), Some("project = foo")); + let params = "project = foo neon_endpoint_type:read_write neon_lsn:0/2 neon_proxy_params_compat:true"; + assert_eq!(filtered_options(params).as_deref(), Some("project = foo")); } } diff --git a/proxy/src/console_redirect_proxy.rs b/proxy/src/console_redirect_proxy.rs index 8f78df19649b..7db1179eeae8 100644 --- a/proxy/src/console_redirect_proxy.rs +++ b/proxy/src/console_redirect_proxy.rs @@ -206,6 +206,7 @@ pub(crate) async fn handle_client( let mut node = connect_to_compute( ctx, &TcpMechanism { + params_compat: true, params: ¶ms, locks: &config.connect_compute_locks, }, diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs index 585dce7baeb8..a3027abd7cae 100644 --- a/proxy/src/proxy/connect_compute.rs +++ b/proxy/src/proxy/connect_compute.rs @@ -66,6 +66,8 @@ pub(crate) trait ComputeConnectBackend { } pub(crate) struct TcpMechanism<'a> { + pub(crate) params_compat: bool, + /// KV-dictionary with PostgreSQL connection params. 
pub(crate) params: &'a StartupMessageParams, @@ -92,7 +94,7 @@ impl ConnectMechanism for TcpMechanism<'_> { } fn update_connect_config(&self, config: &mut compute::ConnCfg) { - config.set_startup_params(self.params); + config.set_startup_params(self.params, self.params_compat); } } diff --git a/proxy/src/proxy/mod.rs b/proxy/src/proxy/mod.rs index af97fb3d7159..f74eb5940fb2 100644 --- a/proxy/src/proxy/mod.rs +++ b/proxy/src/proxy/mod.rs @@ -338,9 +338,17 @@ pub(crate) async fn handle_client( } }; + let params_compat = match &user_info { + auth::Backend::ControlPlane(_, info) => { + info.info.options.get(NeonOptions::PARAMS_COMPAT).is_some() + } + auth::Backend::Local(_) => false, + }; + let mut node = connect_to_compute( ctx, &TcpMechanism { + params_compat, params: ¶ms, locks: &config.connect_compute_locks, }, @@ -409,19 +417,47 @@ pub(crate) async fn prepare_client_connection

( pub(crate) struct NeonOptions(Vec<(SmolStr, SmolStr)>); impl NeonOptions { + // proxy options: + + /// `PARAMS_COMPAT` allows opting in to forwarding all startup parameters from client to compute. + const PARAMS_COMPAT: &str = "proxy_params_compat"; + + // cplane options: + + /// `LSN` allows provisioning an ephemeral compute with time-travel to the provided LSN. + const LSN: &str = "lsn"; + + /// `ENDPOINT_TYPE` allows configuring an ephemeral compute to be read_only or read_write. + const ENDPOINT_TYPE: &str = "endpoint_type"; + pub(crate) fn parse_params(params: &StartupMessageParams) -> Self { params .options_raw() .map(Self::parse_from_iter) .unwrap_or_default() } + pub(crate) fn parse_options_raw(options: &str) -> Self { Self::parse_from_iter(StartupMessageParams::parse_options_raw(options)) } + pub(crate) fn get(&self, key: &str) -> Option { + self.0 + .iter() + .find_map(|(k, v)| (k == key).then_some(v)) + .cloned() + } + pub(crate) fn is_ephemeral(&self) -> bool { - // Currently, neon endpoint options are all reserved for ephemeral endpoints. - !self.0.is_empty() + self.0.iter().any(|(k, _)| match &**k { + // This is not a cplane option, we know it does not create ephemeral computes. + Self::PARAMS_COMPAT => false, + Self::LSN => true, + Self::ENDPOINT_TYPE => true, + // err on the side of caution. any cplane options we don't know about + // might lead to ephemeral computes. + _ => true, + }) } fn parse_from_iter<'a>(options: impl Iterator) -> Self { diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs index d72331c7bf78..59c9ac27b838 100644 --- a/proxy/src/proxy/tests/mitm.rs +++ b/proxy/src/proxy/tests/mitm.rs @@ -55,7 +55,13 @@ async fn proxy_mitm( // give the end_server the startup parameters let mut buf = BytesMut::new(); - frontend::startup_message(startup.iter(), &mut buf).unwrap(); + frontend::startup_message( + &postgres_protocol::message::frontend::StartupMessageParams { + params: startup.params.into(), + }, + &mut buf, + ) + .unwrap(); end_server.send(buf.freeze()).await.unwrap(); // proxy messages between end_client and end_server diff --git a/proxy/src/proxy/tests/mod.rs b/proxy/src/proxy/tests/mod.rs index 53345431e3cf..911b349416f2 100644 --- a/proxy/src/proxy/tests/mod.rs +++ b/proxy/src/proxy/tests/mod.rs @@ -252,7 +252,7 @@ async fn handshake_raw() -> anyhow::Result<()> { let _conn = postgres_client::Config::new("test".to_owned(), 5432) .user("john_doe") .dbname("earth") - .options("project=generic-project-name") + .set_param("options", "project=generic-project-name") .ssl_mode(SslMode::Prefer) .connect_raw(server, NoTls) .await?; diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 55d2e47fd3f2..251aa470843d 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -309,10 +309,13 @@ impl PoolingBackend { .config .user(&conn_info.user_info.user) .dbname(&conn_info.dbname) - .options(&format!( - "-c pg_session_jwt.jwk={}", - serde_json::to_string(&jwk).expect("serializing jwk to json should not fail") - )); + .set_param( + "options", + &format!( + "-c pg_session_jwt.jwk={}", + serde_json::to_string(&jwk).expect("serializing jwk to json should not fail") + ), + ); let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Compute); let (client, connection) = config.connect(postgres_client::NoTls).await?; diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 9c579373e8ea..60c4a2393609 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ 
b/test_runner/fixtures/neon_fixtures.py @@ -269,7 +269,7 @@ async def connect_async(self, **kwargs: Any) -> asyncpg.Connection: for match in re.finditer(r"-c(\w*)=(\w*)", options): key = match.group(1) val = match.group(2) - if "server_options" in conn_options: + if "server_settings" in conn_options: conn_options["server_settings"].update({key: val}) else: conn_options["server_settings"] = {key: val} diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index 5a01d90d8548..d8df2efc78f2 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -5,6 +5,7 @@ import subprocess import time import urllib.parse +from contextlib import closing from typing import TYPE_CHECKING import psycopg2 @@ -131,6 +132,24 @@ def test_proxy_options(static_proxy: NeonProxy, option_name: str): assert out[0][0] == " str" +@pytest.mark.asyncio +async def test_proxy_arbitrary_params(static_proxy: NeonProxy): + with closing( + await static_proxy.connect_async(server_settings={"IntervalStyle": "iso_8601"}) + ) as conn: + out = await conn.fetchval("select to_json('0 seconds'::interval)") + assert out == '"00:00:00"' + + options = "neon_proxy_params_compat:true" + with closing( + await static_proxy.connect_async( + server_settings={"IntervalStyle": "iso_8601", "options": options} + ) + ) as conn: + out = await conn.fetchval("select to_json('0 seconds'::interval)") + assert out == '"PT0S"' + + def test_auth_errors(static_proxy: NeonProxy): """ Check that we throw very specific errors in some unsuccessful auth scenarios. From 9a4157dadbf463ce479d68f3663824b4400d7f9a Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Wed, 4 Dec 2024 14:05:31 +0100 Subject: [PATCH 36/65] feat(compute): Set default application_name for pgbouncer connections (#9973) ## Problem When client specifies `application_name`, pgbouncer propagates it to the Postgres. Yet, if client doesn't do it, we have hard time figuring out who opens a lot of Postgres connections (including the `cloud_admin` ones). See this investigation as an example: https://neondb.slack.com/archives/C0836R0RZ0D ## Summary of changes I haven't found this documented, but it looks like pgbouncer accepts standard Postgres connstring parameters in the connstring in the `[databases]` section, so put the default `application_name=pgbouncer` there. That way, we will always see who opens Postgres connections. I did tests, and if client specifies a `application_name`, pgbouncer overrides this default, so it only works if it's not specified or set to blank `&application_name=` in the connection string. This is the last place we could potentially open some Postgres connections without `application_name`. Everything else should be either of two: 1. Direct client connections without `application_name`, but these should be strictly non-`cloud_admin` ones 2. Some ad-hoc internal connections, so if we see spikes of unidentified `cloud_admin` connections, we will need to investigate it again. Fixes neondatabase/cloud#20948 --- compute/etc/pgbouncer.ini | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/compute/etc/pgbouncer.ini b/compute/etc/pgbouncer.ini index cb994f961c87..abcd1656361f 100644 --- a/compute/etc/pgbouncer.ini +++ b/compute/etc/pgbouncer.ini @@ -1,5 +1,9 @@ [databases] -*=host=localhost port=5432 auth_user=cloud_admin +;; pgbouncer propagates application_name (if it's specified) to the server, but some +;; clients don't set it. 
We set default application_name=pgbouncer to make it +;; easier to identify pgbouncer connections in Postgres. If client sets +;; application_name, it will be used instead. +*=host=localhost port=5432 auth_user=cloud_admin application_name=pgbouncer [pgbouncer] listen_port=6432 listen_addr=0.0.0.0 From 699a213c5d8684d4a78bf78af47a790c00921384 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 4 Dec 2024 14:05:53 +0100 Subject: [PATCH 37/65] Display reqwest error source (#10004) ## Problem Reqwest errors don't include details about the inner source error. This means that we get opaque errors like: ``` receive body: error sending request for url (http://localhost:9898/v1/location_config) ``` Instead of the more helpful: ``` receive body: error sending request for url (http://localhost:9898/v1/location_config): operation timed out ``` Touches #9801. ## Summary of changes Include the source error for `reqwest::Error` wherever it's displayed. --- control_plane/src/safekeeper.rs | 3 ++- pageserver/client/src/mgmt_api.rs | 6 +++--- pageserver/src/consumption_metrics/upload.rs | 7 ++++++- safekeeper/src/http/client.rs | 3 ++- storage_controller/src/compute_hook.rs | 3 ++- storage_controller/src/peer_client.rs | 11 ++++++++--- storage_scrubber/src/cloud_admin_api.rs | 14 ++++++++++---- 7 files changed, 33 insertions(+), 14 deletions(-) diff --git a/control_plane/src/safekeeper.rs b/control_plane/src/safekeeper.rs index 7a019bce886a..f0c37229254c 100644 --- a/control_plane/src/safekeeper.rs +++ b/control_plane/src/safekeeper.rs @@ -5,6 +5,7 @@ //! ```text //! .neon/safekeepers/ //! ``` +use std::error::Error as _; use std::future::Future; use std::io::Write; use std::path::PathBuf; @@ -26,7 +27,7 @@ use crate::{ #[derive(Error, Debug)] pub enum SafekeeperHttpError { - #[error("Reqwest error: {0}")] + #[error("request error: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())] Transport(#[from] reqwest::Error), #[error("Error: {0}")] diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 4d76c66905c4..c3a1ef8140cb 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -1,4 +1,4 @@ -use std::collections::HashMap; +use std::{collections::HashMap, error::Error as _}; use bytes::Bytes; use detach_ancestor::AncestorDetached; @@ -25,10 +25,10 @@ pub struct Client { #[derive(thiserror::Error, Debug)] pub enum Error { - #[error("send request: {0}")] + #[error("send request: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())] SendRequest(reqwest::Error), - #[error("receive body: {0}")] + #[error("receive body: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())] ReceiveBody(reqwest::Error), #[error("receive error body: {0}")] diff --git a/pageserver/src/consumption_metrics/upload.rs b/pageserver/src/consumption_metrics/upload.rs index 1cb4e917c081..448bf4752581 100644 --- a/pageserver/src/consumption_metrics/upload.rs +++ b/pageserver/src/consumption_metrics/upload.rs @@ -1,3 +1,4 @@ +use std::error::Error as _; use std::time::SystemTime; use chrono::{DateTime, Utc}; @@ -350,7 +351,11 @@ impl std::fmt::Display for UploadError { match self { Rejected(code) => write!(f, "server rejected the metrics with {code}"), - Reqwest(e) => write!(f, "request failed: {e}"), + Reqwest(e) => write!( + f, + "request failed: {e}{}", + e.source().map(|e| format!(": {e}")).unwrap_or_default() + ), Cancelled => write!(f, "cancelled"), } } diff --git a/safekeeper/src/http/client.rs b/safekeeper/src/http/client.rs 
index c56f7880d4f8..a166fc1ab9b0 100644 --- a/safekeeper/src/http/client.rs +++ b/safekeeper/src/http/client.rs @@ -8,6 +8,7 @@ //! etc. use reqwest::{IntoUrl, Method, StatusCode}; +use std::error::Error as _; use utils::{ http::error::HttpErrorBody, id::{NodeId, TenantId, TimelineId}, @@ -26,7 +27,7 @@ pub struct Client { #[derive(thiserror::Error, Debug)] pub enum Error { /// Failed to receive body (reqwest error). - #[error("receive body: {0}")] + #[error("receive body: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())] ReceiveBody(reqwest::Error), /// Status is not ok, but failed to parse body as `HttpErrorBody`. diff --git a/storage_controller/src/compute_hook.rs b/storage_controller/src/compute_hook.rs index b63a322b879f..2b2ece3f0271 100644 --- a/storage_controller/src/compute_hook.rs +++ b/storage_controller/src/compute_hook.rs @@ -1,3 +1,4 @@ +use std::error::Error as _; use std::sync::Arc; use std::{collections::HashMap, time::Duration}; @@ -172,7 +173,7 @@ struct ComputeHookNotifyRequest { #[derive(thiserror::Error, Debug)] pub(crate) enum NotifyError { // Request was not send successfully, e.g. transport error - #[error("Sending request: {0}")] + #[error("Sending request: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())] Request(#[from] reqwest::Error), // Request could not be serviced right now due to ongoing Operation in control plane, but should be possible soon. #[error("Control plane tenant busy")] diff --git a/storage_controller/src/peer_client.rs b/storage_controller/src/peer_client.rs index 3f8520fe557c..ee4eb55294d1 100644 --- a/storage_controller/src/peer_client.rs +++ b/storage_controller/src/peer_client.rs @@ -1,7 +1,9 @@ use crate::tenant_shard::ObservedState; use pageserver_api::shard::TenantShardId; use serde::{Deserialize, Serialize}; -use std::{collections::HashMap, time::Duration}; +use std::collections::HashMap; +use std::error::Error as _; +use std::time::Duration; use tokio_util::sync::CancellationToken; use hyper::Uri; @@ -17,11 +19,14 @@ pub(crate) struct PeerClient { #[derive(thiserror::Error, Debug)] pub(crate) enum StorageControllerPeerError { - #[error("failed to deserialize error response with status code {0} at {1}: {2}")] + #[error( + "failed to deserialize error response with status code {0} at {1}: {2}{}", + .2.source().map(|e| format!(": {e}")).unwrap_or_default() + )] DeserializationError(StatusCode, Url, reqwest::Error), #[error("storage controller peer API error ({0}): {1}")] ApiError(StatusCode, String), - #[error("failed to send HTTP request: {0}")] + #[error("failed to send HTTP request: {0}{}", .0.source().map(|e| format!(": {e}")).unwrap_or_default())] SendError(reqwest::Error), #[error("Cancelled")] Cancelled, diff --git a/storage_scrubber/src/cloud_admin_api.rs b/storage_scrubber/src/cloud_admin_api.rs index c9a62cd256d7..b1dfe3a53f28 100644 --- a/storage_scrubber/src/cloud_admin_api.rs +++ b/storage_scrubber/src/cloud_admin_api.rs @@ -1,3 +1,5 @@ +use std::error::Error as _; + use chrono::{DateTime, Utc}; use futures::Future; use hex::FromHex; @@ -30,14 +32,18 @@ impl std::fmt::Display for Error { match &self.kind { ErrorKind::RequestSend(e) => write!( f, - "Failed to send a request. Context: {}, error: {}", - self.context, e + "Failed to send a request. Context: {}, error: {}{}", + self.context, + e, + e.source().map(|e| format!(": {e}")).unwrap_or_default() ), ErrorKind::BodyRead(e) => { write!( f, - "Failed to read a request body. 
Context: {}, error: {}", - self.context, e + "Failed to read a request body. Context: {}, error: {}{}", + self.context, + e, + e.source().map(|e| format!(": {e}")).unwrap_or_default() ) } ErrorKind::ResponseStatus(status) => { From dec2e2fb2997225be9f99687a8829a1e9c473313 Mon Sep 17 00:00:00 2001 From: a-masterov <72613290+a-masterov@users.noreply.github.com> Date: Wed, 4 Dec 2024 14:10:00 +0100 Subject: [PATCH 38/65] Create a branch for compute release (#9637) ## Problem We practice a manual release flow for the compute module. This will allow automation of the compute release process. ## Summary of changes The workflow was modified to make a compute release automatically on the branch release-compute. ## Checklist before requesting a review - [x] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --- .../actions/allure-report-generate/action.yml | 3 ++- .../actions/allure-report-store/action.yml | 3 ++- .github/workflows/_create-release-pr.yml | 2 +- .github/workflows/build_and_test.yml | 23 +++++++++++-------- .github/workflows/release.yml | 23 ++++++++++++++++--- .github/workflows/trigger-e2e-tests.yml | 2 ++ 6 files changed, 41 insertions(+), 15 deletions(-) diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml index d1d09223dbff..d6219c31b4a5 100644 --- a/.github/actions/allure-report-generate/action.yml +++ b/.github/actions/allure-report-generate/action.yml @@ -43,7 +43,8 @@ runs: PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true) if [ "${PR_NUMBER}" != "null" ]; then BRANCH_OR_PR=pr-${PR_NUMBER} - elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then + elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || \ + [ "${GITHUB_REF_NAME}" = "release-proxy" ] || [ "${GITHUB_REF_NAME}" = "release-compute" ]; then # Shortcut for special branches BRANCH_OR_PR=${GITHUB_REF_NAME} else diff --git a/.github/actions/allure-report-store/action.yml b/.github/actions/allure-report-store/action.yml index 9c376f420ad7..3c83656c8940 100644 --- a/.github/actions/allure-report-store/action.yml +++ b/.github/actions/allure-report-store/action.yml @@ -23,7 +23,8 @@ runs: PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true) if [ "${PR_NUMBER}" != "null" ]; then BRANCH_OR_PR=pr-${PR_NUMBER} - elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || [ "${GITHUB_REF_NAME}" = "release-proxy" ]; then + elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ] || \ + [ "${GITHUB_REF_NAME}" = "release-proxy" ] || [ "${GITHUB_REF_NAME}" = "release-compute" ]; then # Shortcut for special branches BRANCH_OR_PR=${GITHUB_REF_NAME} else diff --git a/.github/workflows/_create-release-pr.yml b/.github/workflows/_create-release-pr.yml index cc6994397f8d..3c130c822945 100644 --- a/.github/workflows/_create-release-pr.yml +++ b/.github/workflows/_create-release-pr.yml @@ -21,7 +21,7 @@ defaults: shell: bash -euo pipefail {0} jobs: - create-storage-release-branch: + create-release-branch: 
runs-on: ubuntu-22.04 permissions: diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index e9e111e7bdae..cb966f292ee5 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -6,6 +6,7 @@ on: - main - release - release-proxy + - release-compute pull_request: defaults: @@ -70,8 +71,10 @@ jobs: echo "tag=release-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT + elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then + echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release', 'release-proxy', 'release-compute'" echo "tag=$GITHUB_RUN_ID" >> $GITHUB_OUTPUT fi shell: bash @@ -513,7 +516,7 @@ jobs: }) trigger-e2e-tests: - if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' }} + if: ${{ !github.event.pull_request.draft || contains( github.event.pull_request.labels.*.name, 'run-e2e-tests-in-draft') || github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' }} needs: [ check-permissions, promote-images, tag ] uses: ./.github/workflows/trigger-e2e-tests.yml secrets: inherit @@ -934,7 +937,7 @@ jobs: neondatabase/neon-test-extensions-v16:${{ needs.tag.outputs.build-tag }} - name: Configure AWS-prod credentials - if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' + if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' uses: aws-actions/configure-aws-credentials@v4 with: aws-region: eu-central-1 @@ -943,12 +946,12 @@ jobs: - name: Login to prod ECR uses: docker/login-action@v3 - if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' + if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' with: registry: 093970136003.dkr.ecr.eu-central-1.amazonaws.com - name: Copy all images to prod ECR - if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' + if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' run: | for image in neon compute-tools {vm-,}compute-node-{v14,v15,v16,v17}; do docker buildx imagetools create -t 093970136003.dkr.ecr.eu-central-1.amazonaws.com/${image}:${{ needs.tag.outputs.build-tag }} \ @@ -968,7 +971,7 @@ jobs: tenant_id: ${{ vars.AZURE_TENANT_ID }} push-to-acr-prod: - if: github.ref_name == 'release'|| github.ref_name == 'release-proxy' + if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' needs: [ tag, promote-images ] uses: ./.github/workflows/_push-to-acr.yml with: @@ -1056,7 +1059,7 @@ jobs: deploy: needs: [ check-permissions, promote-images, tag, build-and-test-locally, trigger-custom-extensions-build-and-wait, push-to-acr-dev, push-to-acr-prod ] # `!failure() && !cancelled()` is required because the workflow depends on the job that can be skipped: `push-to-acr-dev` and `push-to-acr-prod` - if: (github.ref_name == 'main' || github.ref_name == 'release' || 
github.ref_name == 'release-proxy') && !failure() && !cancelled() + if: (github.ref_name == 'main' || github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute') && !failure() && !cancelled() runs-on: [ self-hosted, small ] container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest @@ -1105,13 +1108,15 @@ jobs: -f deployProxyAuthBroker=true \ -f branch=main \ -f dockerTag=${{needs.tag.outputs.build-tag}} + elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then + gh workflow --repo neondatabase/infra run deploy-compute-dev.yml --ref main -f dockerTag=${{needs.tag.outputs.build-tag}} else - echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'" + echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main', 'release', 'release-proxy' or 'release-compute'" exit 1 fi - name: Create git tag - if: github.ref_name == 'release' || github.ref_name == 'release-proxy' + if: github.ref_name == 'release' || github.ref_name == 'release-proxy' || github.ref_name == 'release-compute' uses: actions/github-script@v7 with: # Retry script for 5XX server errors: https://github.com/actions/github-script#retries diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 11f010b6d4f6..f0273b977f0e 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -15,6 +15,10 @@ on: type: boolean description: 'Create Proxy release PR' required: false + create-compute-release-branch: + type: boolean + description: 'Create Compute release PR' + required: false # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job. permissions: {} @@ -25,20 +29,20 @@ defaults: jobs: create-storage-release-branch: - if: ${{ github.event.schedule == '0 6 * * MON' || format('{0}', inputs.create-storage-release-branch) == 'true' }} + if: ${{ github.event.schedule == '0 6 * * MON' || inputs.create-storage-release-branch }} permissions: contents: write uses: ./.github/workflows/_create-release-pr.yml with: - component-name: 'Storage & Compute' + component-name: 'Storage' release-branch: 'release' secrets: ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} create-proxy-release-branch: - if: ${{ github.event.schedule == '0 6 * * THU' || format('{0}', inputs.create-proxy-release-branch) == 'true' }} + if: ${{ github.event.schedule == '0 6 * * THU' || inputs.create-proxy-release-branch }} permissions: contents: write @@ -49,3 +53,16 @@ jobs: release-branch: 'release-proxy' secrets: ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} + + create-compute-release-branch: + if: inputs.create-compute-release-branch + + permissions: + contents: write + + uses: ./.github/workflows/_create-release-pr.yml + with: + component-name: 'Compute' + release-branch: 'release-compute' + secrets: + ci-access-token: ${{ secrets.CI_ACCESS_TOKEN }} diff --git a/.github/workflows/trigger-e2e-tests.yml b/.github/workflows/trigger-e2e-tests.yml index 1e7264c55a87..70c2e8549f3e 100644 --- a/.github/workflows/trigger-e2e-tests.yml +++ b/.github/workflows/trigger-e2e-tests.yml @@ -51,6 +51,8 @@ jobs: echo "tag=release-$(git rev-list --count HEAD)" | tee -a $GITHUB_OUTPUT elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then echo "tag=release-proxy-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT + elif [[ "$GITHUB_REF_NAME" == "release-compute" ]]; then + echo "tag=release-compute-$(git rev-list --count HEAD)" >> $GITHUB_OUTPUT else echo "GITHUB_REF_NAME (value 
'$GITHUB_REF_NAME') is not set to either 'main' or 'release'" BUILD_AND_TEST_RUN_ID=$(gh run list -b $CURRENT_BRANCH -c $CURRENT_SHA -w 'Build and Test' -L 1 --json databaseId --jq '.[].databaseId') From 60c0d19f57c160be46ae364e139a2063d7741522 Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 4 Dec 2024 15:04:04 +0000 Subject: [PATCH 39/65] tests: make storcon scale test AZ-aware (#9952) ## Problem We have a scale test for the storage controller which also acts as a good stress test for scheduling stability. However, it created nodes with no AZs set. ## Summary of changes - Bump node count to 6 and set AZs on them. This is a precursor to other AZ-related PRs, to make sure any new code that's landed is getting scale tested in an AZ-aware environment. --- test_runner/performance/test_storage_controller_scale.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/test_runner/performance/test_storage_controller_scale.py b/test_runner/performance/test_storage_controller_scale.py index 142bd3d669ae..49f41483ec7d 100644 --- a/test_runner/performance/test_storage_controller_scale.py +++ b/test_runner/performance/test_storage_controller_scale.py @@ -72,7 +72,7 @@ def test_storage_controller_many_tenants( we don't fall over for a thousand shards. """ - neon_env_builder.num_pageservers = 5 + neon_env_builder.num_pageservers = 6 neon_env_builder.storage_controller_config = { # Default neon_local uses a small timeout: use a longer one to tolerate longer pageserver restarts. # TODO: tune this down as restarts get faster (https://github.com/neondatabase/neon/pull/7553), to @@ -84,6 +84,11 @@ def test_storage_controller_many_tenants( compute_reconfigure_listener.control_plane_compute_hook_api ) + AZS = ["alpha", "bravo", "charlie"] + neon_env_builder.pageserver_config_override = lambda ps_cfg: ps_cfg.update( + {"availability_zone": f"az-{AZS[ps_cfg['id'] % len(AZS)]}"} + ) + # A small sleep on each call into the notify hook, to simulate the latency of doing a database write compute_reconfigure_listener.register_on_notify(lambda body: time.sleep(0.01)) From e6cd5050fcf9f699275b7adb7509efac0e3cd1b5 Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Wed, 4 Dec 2024 11:54:56 -0500 Subject: [PATCH 40/65] pageserver: make `BufferedWriter` do double-buffering (#9693) Closes #9387. ## Problem `BufferedWriter` cannot proceed while the owned buffer is flushing to disk. We want to implement double buffering so that the flush can happen in the background. See #9387. ## Summary of changes - Maintain two owned buffers in `BufferedWriter`. - The writer is in charge of copying the data into owned, aligned buffer, once full, submit it to the flush task. - The flush background task is in charge of flushing the owned buffer to disk, and returned the buffer to the writer for reuse. - The writer and the flush background task communicate through a bi-directional channel. For in-memory layer, we also need to be able to read from the buffered writer in `get_values_reconstruct_data`. To handle this case, we did the following - Use replace `VirtualFile::write_all` with `VirtualFile::write_all_at`, and use `Arc` to share it between writer and background task. - leverage `IoBufferMut::freeze` to get a cheaply clonable `IoBuffer`, one clone will be submitted to the channel, the other clone will be saved within the writer to serve reads. When we want to reuse the buffer, we can invoke `IoBuffer::into_mut`, which gives us back the mutable aligned buffer. 
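Before the diff itself, a toy illustration of the buffer hand-off described in the bullets above and below may help. This is not the pageserver implementation: it uses plain `Vec<u8>` buffers and two Tokio `mpsc` channels in place of `IoBufferMut`, the gate guard, and the `utils::sync::duplex` helper that this patch adds; the chunk sizes and sink are made up for the example.

```
use tokio::sync::mpsc;

#[tokio::main]
async fn main() {
    // Writer -> flusher: full buffers. Flusher -> writer: recycled buffers.
    let (to_flush, mut flush_rx) = mpsc::channel::<Vec<u8>>(1);
    let (recycle_tx, mut recycle_rx) = mpsc::channel::<Vec<u8>>(1);

    // Background "flush" task: drains buffers into a sink, then hands them back.
    let flusher = tokio::spawn(async move {
        let mut sink: Vec<u8> = Vec::new();
        while let Some(buf) = flush_rx.recv().await {
            sink.extend_from_slice(&buf); // stand-in for the actual disk write
            let mut recycled = buf;
            recycled.clear();
            if recycle_tx.send(recycled).await.is_err() {
                break;
            }
        }
        sink
    });

    // Foreground writer: keep filling one buffer while the other one flushes.
    const CAP: usize = 8;
    let mut current = Vec::with_capacity(CAP);
    let mut spare = Some(Vec::with_capacity(CAP));
    let chunks: [&[u8]; 4] = [b"hello ", b"double ", b"buffered ", b"world"];
    for chunk in chunks {
        if current.len() + chunk.len() > CAP {
            to_flush.send(current).await.unwrap(); // submit for background flush
            current = match spare.take() {
                Some(buf) => buf,                         // second buffer, used once
                None => recycle_rx.recv().await.unwrap(), // afterwards reuse flushed ones
            };
        }
        current.extend_from_slice(chunk);
    }
    to_flush.send(current).await.unwrap();
    drop(to_flush); // close the channel so the flush task exits

    assert_eq!(flusher.await.unwrap(), b"hello double buffered world".to_vec());
}
```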
- InMemoryLayer reads is now aware of the maybe_flushed part of the buffer. **Caveat** - We removed the owned version of write, because this interface does not work well with buffer alignment. The result is that without direct IO enabled, [`download_object`](https://github.com/neondatabase/neon/blob/a439d57050dafd603d24e001215213eb5246a029/pageserver/src/tenant/remote_timeline_client/download.rs#L243) does one more memcpy than before this PR due to the switch to use `_borrowed` version of the write. - "Bypass aligned part of write" could be implemented later to avoid large amount of memcpy. **Testing** - use an oneshot channel based control mechanism to make flush behavior deterministic in test. - test reading from `EphemeralFile` when the last submitted buffer is not flushed, in-progress, and done flushing to disk. ## Performance We see performance improvement for small values, and regression on big values, likely due to being CPU bound + disk write latency. [Results](https://www.notion.so/neondatabase/Benchmarking-New-BufferedWriter-11-20-2024-143f189e0047805ba99acda89f984d51?pvs=4) ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. ## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Signed-off-by: Yuchen Liang Co-authored-by: Christian Schwarz --- libs/utils/src/sync.rs | 1 + libs/utils/src/sync/duplex.rs | 1 + libs/utils/src/sync/duplex/mpsc.rs | 36 ++ pageserver/benches/bench_ingest.rs | 4 +- pageserver/src/tenant/ephemeral_file.rs | 286 +++++++++----- .../src/tenant/remote_timeline_client.rs | 2 + .../tenant/remote_timeline_client/download.rs | 44 ++- pageserver/src/tenant/secondary/downloader.rs | 1 + .../tenant/storage_layer/inmemory_layer.rs | 5 +- pageserver/src/tenant/storage_layer/layer.rs | 1 + pageserver/src/tenant/timeline.rs | 3 +- .../src/tenant/timeline/layer_manager.rs | 14 +- pageserver/src/virtual_file.rs | 26 +- .../aligned_buffer/alignment.rs | 4 +- .../owned_buffers_io/aligned_buffer/buffer.rs | 18 +- .../aligned_buffer/buffer_mut.rs | 43 ++- .../owned_buffers_io/io_buf_aligned.rs | 10 +- .../owned_buffers_io/io_buf_ext.rs | 14 + .../util/size_tracking_writer.rs | 50 --- .../virtual_file/owned_buffers_io/write.rs | 358 +++++++++--------- .../owned_buffers_io/write/flush.rs | 314 +++++++++++++++ 21 files changed, 846 insertions(+), 389 deletions(-) create mode 100644 libs/utils/src/sync/duplex.rs create mode 100644 libs/utils/src/sync/duplex/mpsc.rs delete mode 100644 pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs create mode 100644 pageserver/src/virtual_file/owned_buffers_io/write/flush.rs diff --git a/libs/utils/src/sync.rs b/libs/utils/src/sync.rs index 7aa26e24bcc4..280637de8feb 100644 --- a/libs/utils/src/sync.rs +++ b/libs/utils/src/sync.rs @@ -1,5 +1,6 @@ pub mod heavier_once_cell; +pub mod duplex; pub mod gate; pub mod spsc_fold; diff --git a/libs/utils/src/sync/duplex.rs b/libs/utils/src/sync/duplex.rs new file mode 100644 index 000000000000..fac79297a086 --- /dev/null +++ b/libs/utils/src/sync/duplex.rs @@ -0,0 +1 @@ +pub mod mpsc; diff --git a/libs/utils/src/sync/duplex/mpsc.rs b/libs/utils/src/sync/duplex/mpsc.rs new file mode 100644 index 
000000000000..56b4e6d2b331 --- /dev/null +++ b/libs/utils/src/sync/duplex/mpsc.rs @@ -0,0 +1,36 @@ +use tokio::sync::mpsc; + +/// A bi-directional channel. +pub struct Duplex { + pub tx: mpsc::Sender, + pub rx: mpsc::Receiver, +} + +/// Creates a bi-directional channel. +/// +/// The channel will buffer up to the provided number of messages. Once the buffer is full, +/// attempts to send new messages will wait until a message is received from the channel. +/// The provided buffer capacity must be at least 1. +pub fn channel(buffer: usize) -> (Duplex, Duplex) { + let (tx_a, rx_a) = mpsc::channel::(buffer); + let (tx_b, rx_b) = mpsc::channel::(buffer); + + (Duplex { tx: tx_a, rx: rx_b }, Duplex { tx: tx_b, rx: rx_a }) +} + +impl Duplex { + /// Sends a value, waiting until there is capacity. + /// + /// A successful send occurs when it is determined that the other end of the channel has not hung up already. + pub async fn send(&self, x: S) -> Result<(), mpsc::error::SendError> { + self.tx.send(x).await + } + + /// Receives the next value for this receiver. + /// + /// This method returns `None` if the channel has been closed and there are + /// no remaining messages in the channel's buffer. + pub async fn recv(&mut self) -> Option { + self.rx.recv().await + } +} diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index caacd365b306..b67a9cc47951 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -62,10 +62,8 @@ async fn ingest( let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); let gate = utils::sync::gate::Gate::default(); - let entered = gate.enter().unwrap(); - let layer = - InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, entered, &ctx).await?; + let layer = InMemoryLayer::create(conf, timeline_id, tenant_shard_id, lsn, &gate, &ctx).await?; let data = Value::Image(Bytes::from(vec![0u8; put_size])); let data_ser_size = data.serialized_size().unwrap() as usize; diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index de0abab4c0c7..aaec8a4c313a 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -8,10 +8,8 @@ use crate::page_cache; use crate::tenant::storage_layer::inmemory_layer::vectored_dio_read::File; use crate::virtual_file::owned_buffers_io::io_buf_aligned::IoBufAlignedMut; use crate::virtual_file::owned_buffers_io::slice::SliceMutExt; -use crate::virtual_file::owned_buffers_io::util::size_tracking_writer; use crate::virtual_file::owned_buffers_io::write::Buffer; use crate::virtual_file::{self, owned_buffers_io, IoBufferMut, VirtualFile}; -use bytes::BytesMut; use camino::Utf8PathBuf; use num_traits::Num; use pageserver_api::shard::TenantShardId; @@ -20,6 +18,7 @@ use tracing::error; use std::io; use std::sync::atomic::AtomicU64; +use std::sync::Arc; use utils::id::TimelineId; pub struct EphemeralFile { @@ -27,10 +26,7 @@ pub struct EphemeralFile { _timeline_id: TimelineId, page_cache_file_id: page_cache::FileId, bytes_written: u64, - buffered_writer: owned_buffers_io::write::BufferedWriter< - BytesMut, - size_tracking_writer::Writer, - >, + buffered_writer: owned_buffers_io::write::BufferedWriter, /// Gate guard is held on as long as we need to do operations in the path (delete on drop) _gate_guard: utils::sync::gate::GateGuard, } @@ -42,9 +38,9 @@ impl EphemeralFile { conf: &PageServerConf, tenant_shard_id: TenantShardId, timeline_id: TimelineId, - gate_guard: 
utils::sync::gate::GateGuard, + gate: &utils::sync::gate::Gate, ctx: &RequestContext, - ) -> Result { + ) -> anyhow::Result { static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1); let filename_disambiguator = NEXT_FILENAME.fetch_add(1, std::sync::atomic::Ordering::Relaxed); @@ -55,15 +51,17 @@ impl EphemeralFile { "ephemeral-{filename_disambiguator}" ))); - let file = VirtualFile::open_with_options( - &filename, - virtual_file::OpenOptions::new() - .read(true) - .write(true) - .create(true), - ctx, - ) - .await?; + let file = Arc::new( + VirtualFile::open_with_options_v2( + &filename, + virtual_file::OpenOptions::new() + .read(true) + .write(true) + .create(true), + ctx, + ) + .await?, + ); let page_cache_file_id = page_cache::next_file_id(); // XXX get rid, we're not page-caching anymore @@ -73,10 +71,12 @@ impl EphemeralFile { page_cache_file_id, bytes_written: 0, buffered_writer: owned_buffers_io::write::BufferedWriter::new( - size_tracking_writer::Writer::new(file), - BytesMut::with_capacity(TAIL_SZ), + file, + || IoBufferMut::with_capacity(TAIL_SZ), + gate.enter()?, + ctx, ), - _gate_guard: gate_guard, + _gate_guard: gate.enter()?, }) } } @@ -85,7 +85,7 @@ impl Drop for EphemeralFile { fn drop(&mut self) { // unlink the file // we are clear to do this, because we have entered a gate - let path = self.buffered_writer.as_inner().as_inner().path(); + let path = self.buffered_writer.as_inner().path(); let res = std::fs::remove_file(path); if let Err(e) = res { if e.kind() != std::io::ErrorKind::NotFound { @@ -132,6 +132,18 @@ impl EphemeralFile { srcbuf: &[u8], ctx: &RequestContext, ) -> std::io::Result { + let (pos, control) = self.write_raw_controlled(srcbuf, ctx).await?; + if let Some(control) = control { + control.release().await; + } + Ok(pos) + } + + async fn write_raw_controlled( + &mut self, + srcbuf: &[u8], + ctx: &RequestContext, + ) -> std::io::Result<(u64, Option)> { let pos = self.bytes_written; let new_bytes_written = pos.checked_add(srcbuf.len().into_u64()).ok_or_else(|| { @@ -145,9 +157,9 @@ impl EphemeralFile { })?; // Write the payload - let nwritten = self + let (nwritten, control) = self .buffered_writer - .write_buffered_borrowed(srcbuf, ctx) + .write_buffered_borrowed_controlled(srcbuf, ctx) .await?; assert_eq!( nwritten, @@ -157,7 +169,7 @@ impl EphemeralFile { self.bytes_written = new_bytes_written; - Ok(pos) + Ok((pos, control)) } } @@ -168,11 +180,12 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral dst: tokio_epoll_uring::Slice, ctx: &'a RequestContext, ) -> std::io::Result<(tokio_epoll_uring::Slice, usize)> { - let file_size_tracking_writer = self.buffered_writer.as_inner(); - let flushed_offset = file_size_tracking_writer.bytes_written(); + let submitted_offset = self.buffered_writer.bytes_submitted(); + + let mutable = self.buffered_writer.inspect_mutable(); + let mutable = &mutable[0..mutable.pending()]; - let buffer = self.buffered_writer.inspect_buffer(); - let buffered = &buffer[0..buffer.pending()]; + let maybe_flushed = self.buffered_writer.inspect_maybe_flushed(); let dst_cap = dst.bytes_total().into_u64(); let end = { @@ -197,11 +210,42 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral } } } - let written_range = Range(start, std::cmp::min(end, flushed_offset)); - let buffered_range = Range(std::cmp::max(start, flushed_offset), end); + + let (written_range, maybe_flushed_range) = { + if maybe_flushed.is_some() { + // [ written ][ maybe_flushed ][ mutable ] + // <- TAIL_SZ -><- TAIL_SZ 
-> + // ^ + // `submitted_offset` + // <++++++ on disk +++++++????????????????> + ( + Range( + start, + std::cmp::min(end, submitted_offset.saturating_sub(TAIL_SZ as u64)), + ), + Range( + std::cmp::max(start, submitted_offset.saturating_sub(TAIL_SZ as u64)), + std::cmp::min(end, submitted_offset), + ), + ) + } else { + // [ written ][ mutable ] + // <- TAIL_SZ -> + // ^ + // `submitted_offset` + // <++++++ on disk +++++++++++++++++++++++> + ( + Range(start, std::cmp::min(end, submitted_offset)), + // zero len + Range(submitted_offset, u64::MIN), + ) + } + }; + + let mutable_range = Range(std::cmp::max(start, submitted_offset), end); let dst = if written_range.len() > 0 { - let file: &VirtualFile = file_size_tracking_writer.as_inner(); + let file: &VirtualFile = self.buffered_writer.as_inner(); let bounds = dst.bounds(); let slice = file .read_exact_at(dst.slice(0..written_range.len().into_usize()), start, ctx) @@ -211,19 +255,21 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral dst }; - let dst = if buffered_range.len() > 0 { - let offset_in_buffer = buffered_range + let dst = if maybe_flushed_range.len() > 0 { + let offset_in_buffer = maybe_flushed_range .0 - .checked_sub(flushed_offset) + .checked_sub(submitted_offset.saturating_sub(TAIL_SZ as u64)) .unwrap() .into_usize(); - let to_copy = - &buffered[offset_in_buffer..(offset_in_buffer + buffered_range.len().into_usize())]; + // Checked previously the buffer is Some. + let maybe_flushed = maybe_flushed.unwrap(); + let to_copy = &maybe_flushed + [offset_in_buffer..(offset_in_buffer + maybe_flushed_range.len().into_usize())]; let bounds = dst.bounds(); let mut view = dst.slice({ let start = written_range.len().into_usize(); let end = start - .checked_add(buffered_range.len().into_usize()) + .checked_add(maybe_flushed_range.len().into_usize()) .unwrap(); start..end }); @@ -234,6 +280,28 @@ impl super::storage_layer::inmemory_layer::vectored_dio_read::File for Ephemeral dst }; + let dst = if mutable_range.len() > 0 { + let offset_in_buffer = mutable_range + .0 + .checked_sub(submitted_offset) + .unwrap() + .into_usize(); + let to_copy = + &mutable[offset_in_buffer..(offset_in_buffer + mutable_range.len().into_usize())]; + let bounds = dst.bounds(); + let mut view = dst.slice({ + let start = + written_range.len().into_usize() + maybe_flushed_range.len().into_usize(); + let end = start.checked_add(mutable_range.len().into_usize()).unwrap(); + start..end + }); + view.as_mut_rust_slice_full_zeroed() + .copy_from_slice(to_copy); + Slice::from_buf_bounds(Slice::into_inner(view), bounds) + } else { + dst + }; + // TODO: in debug mode, randomize the remaining bytes in `dst` to catch bugs Ok((dst, (end - start).into_usize())) @@ -295,7 +363,7 @@ mod tests { let gate = utils::sync::gate::Gate::default(); - let file = EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx) + let file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx) .await .unwrap(); @@ -326,14 +394,15 @@ mod tests { let gate = utils::sync::gate::Gate::default(); - let mut file = - EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx) - .await - .unwrap(); + let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx) + .await + .unwrap(); - let cap = file.buffered_writer.inspect_buffer().capacity(); + let mutable = file.buffered_writer.inspect_mutable(); + let cap = mutable.capacity(); + let align = mutable.align(); - let write_nbytes = cap + cap / 2; + let 
write_nbytes = cap * 2 + cap / 2; let content: Vec = rand::thread_rng() .sample_iter(rand::distributions::Standard) @@ -341,30 +410,39 @@ mod tests { .collect(); let mut value_offsets = Vec::new(); - for i in 0..write_nbytes { - let off = file.write_raw(&content[i..i + 1], &ctx).await.unwrap(); + for range in (0..write_nbytes) + .step_by(align) + .map(|start| start..(start + align).min(write_nbytes)) + { + let off = file.write_raw(&content[range], &ctx).await.unwrap(); value_offsets.push(off); } - assert!(file.len() as usize == write_nbytes); - for i in 0..write_nbytes { - assert_eq!(value_offsets[i], i.into_u64()); - let buf = IoBufferMut::with_capacity(1); + assert_eq!(file.len() as usize, write_nbytes); + for (i, range) in (0..write_nbytes) + .step_by(align) + .map(|start| start..(start + align).min(write_nbytes)) + .enumerate() + { + assert_eq!(value_offsets[i], range.start.into_u64()); + let buf = IoBufferMut::with_capacity(range.len()); let (buf_slice, nread) = file - .read_exact_at_eof_ok(i.into_u64(), buf.slice_full(), &ctx) + .read_exact_at_eof_ok(range.start.into_u64(), buf.slice_full(), &ctx) .await .unwrap(); let buf = buf_slice.into_inner(); - assert_eq!(nread, 1); - assert_eq!(&buf, &content[i..i + 1]); + assert_eq!(nread, range.len()); + assert_eq!(&buf, &content[range]); } - let file_contents = - std::fs::read(file.buffered_writer.as_inner().as_inner().path()).unwrap(); - assert_eq!(file_contents, &content[0..cap]); + let file_contents = std::fs::read(file.buffered_writer.as_inner().path()).unwrap(); + assert!(file_contents == content[0..cap * 2]); + + let maybe_flushed_buffer_contents = file.buffered_writer.inspect_maybe_flushed().unwrap(); + assert_eq!(&maybe_flushed_buffer_contents[..], &content[cap..cap * 2]); - let buffer_contents = file.buffered_writer.inspect_buffer(); - assert_eq!(buffer_contents, &content[cap..write_nbytes]); + let mutable_buffer_contents = file.buffered_writer.inspect_mutable(); + assert_eq!(mutable_buffer_contents, &content[cap * 2..write_nbytes]); } #[tokio::test] @@ -373,16 +451,16 @@ mod tests { let gate = utils::sync::gate::Gate::default(); - let mut file = - EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx) - .await - .unwrap(); + let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx) + .await + .unwrap(); - let cap = file.buffered_writer.inspect_buffer().capacity(); + // mutable buffer and maybe_flushed buffer each has `cap` bytes. 
+ let cap = file.buffered_writer.inspect_mutable().capacity(); let content: Vec = rand::thread_rng() .sample_iter(rand::distributions::Standard) - .take(cap + cap / 2) + .take(cap * 2 + cap / 2) .collect(); file.write_raw(&content, &ctx).await.unwrap(); @@ -390,23 +468,21 @@ mod tests { // assert the state is as this test expects it to be assert_eq!( &file.load_to_io_buf(&ctx).await.unwrap(), - &content[0..cap + cap / 2] + &content[0..cap * 2 + cap / 2] ); - let md = file - .buffered_writer - .as_inner() - .as_inner() - .path() - .metadata() - .unwrap(); + let md = file.buffered_writer.as_inner().path().metadata().unwrap(); assert_eq!( md.len(), - cap.into_u64(), - "buffered writer does one write if we write 1.5x buffer capacity" + 2 * cap.into_u64(), + "buffered writer requires one write to be flushed if we write 2.5x buffer capacity" + ); + assert_eq!( + &file.buffered_writer.inspect_maybe_flushed().unwrap()[0..cap], + &content[cap..cap * 2] ); assert_eq!( - &file.buffered_writer.inspect_buffer()[0..cap / 2], - &content[cap..cap + cap / 2] + &file.buffered_writer.inspect_mutable()[0..cap / 2], + &content[cap * 2..cap * 2 + cap / 2] ); } @@ -422,19 +498,19 @@ mod tests { let gate = utils::sync::gate::Gate::default(); - let mut file = - EphemeralFile::create(conf, tenant_id, timeline_id, gate.enter().unwrap(), &ctx) - .await - .unwrap(); - - let cap = file.buffered_writer.inspect_buffer().capacity(); + let mut file = EphemeralFile::create(conf, tenant_id, timeline_id, &gate, &ctx) + .await + .unwrap(); + let mutable = file.buffered_writer.inspect_mutable(); + let cap = mutable.capacity(); + let align = mutable.align(); let content: Vec = rand::thread_rng() .sample_iter(rand::distributions::Standard) - .take(cap + cap / 2) + .take(cap * 2 + cap / 2) .collect(); - file.write_raw(&content, &ctx).await.unwrap(); + let (_, control) = file.write_raw_controlled(&content, &ctx).await.unwrap(); let test_read = |start: usize, len: usize| { let file = &file; @@ -454,16 +530,38 @@ mod tests { } }; + let test_read_all_offset_combinations = || { + async move { + test_read(align, align).await; + // border onto edge of file + test_read(cap - align, align).await; + // read across file and buffer + test_read(cap - align, 2 * align).await; + // stay from start of maybe flushed buffer + test_read(cap, align).await; + // completely within maybe flushed buffer + test_read(cap + align, align).await; + // border onto edge of maybe flushed buffer. + test_read(cap * 2 - align, align).await; + // read across maybe flushed and mutable buffer + test_read(cap * 2 - align, 2 * align).await; + // read across three segments + test_read(cap - align, cap + 2 * align).await; + // completely within mutable buffer + test_read(cap * 2 + align, align).await; + } + }; + // completely within the file range - assert!(20 < cap, "test assumption"); - test_read(10, 10).await; - // border onto edge of file - test_read(cap - 10, 10).await; - // read across file and buffer - test_read(cap - 10, 20).await; - // stay from start of buffer - test_read(cap, 10).await; - // completely within buffer - test_read(cap + 10, 10).await; + assert!(align < cap, "test assumption"); + assert!(cap % align == 0); + + // test reads at different flush stages. 
+ let not_started = control.unwrap().into_not_started(); + test_read_all_offset_combinations().await; + let in_progress = not_started.ready_to_flush(); + test_read_all_offset_combinations().await; + in_progress.wait_until_flush_is_done().await; + test_read_all_offset_combinations().await; } } diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 4bb1bbf3cfd5..89b935947d93 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -681,6 +681,7 @@ impl RemoteTimelineClient { layer_file_name: &LayerName, layer_metadata: &LayerFileMetadata, local_path: &Utf8Path, + gate: &utils::sync::gate::Gate, cancel: &CancellationToken, ctx: &RequestContext, ) -> Result { @@ -700,6 +701,7 @@ impl RemoteTimelineClient { layer_file_name, layer_metadata, local_path, + gate, cancel, ctx, ) diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index 739615be9cef..c5ae466f3afb 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -6,6 +6,7 @@ use std::collections::HashSet; use std::future::Future; use std::str::FromStr; +use std::sync::Arc; use std::time::SystemTime; use anyhow::{anyhow, Context}; @@ -26,9 +27,7 @@ use crate::span::{ use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; use crate::tenant::storage_layer::LayerName; use crate::tenant::Generation; -#[cfg_attr(target_os = "macos", allow(unused_imports))] -use crate::virtual_file::owned_buffers_io::io_buf_ext::IoBufExt; -use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; +use crate::virtual_file::{on_fatal_io_error, IoBufferMut, MaybeFatalIo, VirtualFile}; use crate::TEMP_FILE_SUFFIX; use remote_storage::{ DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, @@ -60,6 +59,7 @@ pub async fn download_layer_file<'a>( layer_file_name: &'a LayerName, layer_metadata: &'a LayerFileMetadata, local_path: &Utf8Path, + gate: &utils::sync::gate::Gate, cancel: &CancellationToken, ctx: &RequestContext, ) -> Result { @@ -88,7 +88,9 @@ pub async fn download_layer_file<'a>( let temp_file_path = path_with_suffix_extension(local_path, TEMP_DOWNLOAD_EXTENSION); let bytes_amount = download_retry( - || async { download_object(storage, &remote_path, &temp_file_path, cancel, ctx).await }, + || async { + download_object(storage, &remote_path, &temp_file_path, gate, cancel, ctx).await + }, &format!("download {remote_path:?}"), cancel, ) @@ -148,6 +150,7 @@ async fn download_object<'a>( storage: &'a GenericRemoteStorage, src_path: &RemotePath, dst_path: &Utf8PathBuf, + gate: &utils::sync::gate::Gate, cancel: &CancellationToken, #[cfg_attr(target_os = "macos", allow(unused_variables))] ctx: &RequestContext, ) -> Result { @@ -205,13 +208,16 @@ async fn download_object<'a>( } #[cfg(target_os = "linux")] crate::virtual_file::io_engine::IoEngine::TokioEpollUring => { - use crate::virtual_file::owned_buffers_io::{self, util::size_tracking_writer}; - use bytes::BytesMut; + use crate::virtual_file::owned_buffers_io; async { - let destination_file = VirtualFile::create(dst_path, ctx) - .await - .with_context(|| format!("create a destination file for layer '{dst_path}'")) - .map_err(DownloadError::Other)?; + let destination_file = Arc::new( + VirtualFile::create(dst_path, ctx) + .await + .with_context(|| { + 
format!("create a destination file for layer '{dst_path}'") + }) + .map_err(DownloadError::Other)?, + ); let mut download = storage .download(src_path, &DownloadOpts::default(), cancel) @@ -219,14 +225,16 @@ async fn download_object<'a>( pausable_failpoint!("before-downloading-layer-stream-pausable"); + let mut buffered = owned_buffers_io::write::BufferedWriter::::new( + destination_file, + || IoBufferMut::with_capacity(super::BUFFER_SIZE), + gate.enter().map_err(|_| DownloadError::Cancelled)?, + ctx, + ); + // TODO: use vectored write (writev) once supported by tokio-epoll-uring. // There's chunks_vectored() on the stream. let (bytes_amount, destination_file) = async { - let size_tracking = size_tracking_writer::Writer::new(destination_file); - let mut buffered = owned_buffers_io::write::BufferedWriter::::new( - size_tracking, - BytesMut::with_capacity(super::BUFFER_SIZE), - ); while let Some(res) = futures::StreamExt::next(&mut download.download_stream).await { @@ -234,10 +242,10 @@ async fn download_object<'a>( Ok(chunk) => chunk, Err(e) => return Err(e), }; - buffered.write_buffered(chunk.slice_len(), ctx).await?; + buffered.write_buffered_borrowed(&chunk, ctx).await?; } - let size_tracking = buffered.flush_and_into_inner(ctx).await?; - Ok(size_tracking.into_inner()) + let inner = buffered.flush_and_into_inner(ctx).await?; + Ok(inner) } .await?; diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 701e4cf04b49..395e34e404c4 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -1183,6 +1183,7 @@ impl<'a> TenantDownloader<'a> { &layer.name, &layer.metadata, &local_path, + &self.secondary_state.gate, &self.secondary_state.cancel, ctx, ) diff --git a/pageserver/src/tenant/storage_layer/inmemory_layer.rs b/pageserver/src/tenant/storage_layer/inmemory_layer.rs index af6112d53550..71e53da20f7f 100644 --- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs +++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs @@ -555,13 +555,12 @@ impl InMemoryLayer { timeline_id: TimelineId, tenant_shard_id: TenantShardId, start_lsn: Lsn, - gate_guard: utils::sync::gate::GateGuard, + gate: &utils::sync::gate::Gate, ctx: &RequestContext, ) -> Result { trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}"); - let file = - EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate_guard, ctx).await?; + let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id, gate, ctx).await?; let key = InMemoryLayerFileId(file.page_cache_file_id()); Ok(InMemoryLayer { diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs index a9f1189b4112..8933e8ceb13e 100644 --- a/pageserver/src/tenant/storage_layer/layer.rs +++ b/pageserver/src/tenant/storage_layer/layer.rs @@ -1149,6 +1149,7 @@ impl LayerInner { &self.desc.layer_name(), &self.metadata(), &self.path, + &timeline.gate, &timeline.cancel, ctx, ) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 1414bef0a5a2..fc741826ab1e 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -3455,7 +3455,6 @@ impl Timeline { ctx: &RequestContext, ) -> anyhow::Result> { let mut guard = self.layers.write().await; - let gate_guard = self.gate.enter().context("enter gate for inmem layer")?; let last_record_lsn = self.get_last_record_lsn(); ensure!( @@ -3472,7 +3471,7 @@ impl Timeline { 
self.conf, self.timeline_id, self.tenant_shard_id, - gate_guard, + &self.gate, ctx, ) .await?; diff --git a/pageserver/src/tenant/timeline/layer_manager.rs b/pageserver/src/tenant/timeline/layer_manager.rs index 4293a44dca25..3888e7f86a9b 100644 --- a/pageserver/src/tenant/timeline/layer_manager.rs +++ b/pageserver/src/tenant/timeline/layer_manager.rs @@ -182,7 +182,7 @@ impl OpenLayerManager { conf: &'static PageServerConf, timeline_id: TimelineId, tenant_shard_id: TenantShardId, - gate_guard: utils::sync::gate::GateGuard, + gate: &utils::sync::gate::Gate, ctx: &RequestContext, ) -> anyhow::Result> { ensure!(lsn.is_aligned()); @@ -212,15 +212,9 @@ impl OpenLayerManager { lsn ); - let new_layer = InMemoryLayer::create( - conf, - timeline_id, - tenant_shard_id, - start_lsn, - gate_guard, - ctx, - ) - .await?; + let new_layer = + InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn, gate, ctx) + .await?; let layer = Arc::new(new_layer); self.layer_map.open_layer = Some(layer.clone()); diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index b9f8c7ea2024..8a7f4a4bf5fd 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -20,7 +20,7 @@ use camino::{Utf8Path, Utf8PathBuf}; use once_cell::sync::OnceCell; use owned_buffers_io::aligned_buffer::buffer::AlignedBuffer; use owned_buffers_io::aligned_buffer::{AlignedBufferMut, AlignedSlice, ConstAlign}; -use owned_buffers_io::io_buf_aligned::IoBufAlignedMut; +use owned_buffers_io::io_buf_aligned::{IoBufAligned, IoBufAlignedMut}; use owned_buffers_io::io_buf_ext::FullSlice; use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT; use pageserver_api::shard::TenantShardId; @@ -63,9 +63,6 @@ pub(crate) mod owned_buffers_io { pub(crate) mod io_buf_ext; pub(crate) mod slice; pub(crate) mod write; - pub(crate) mod util { - pub(crate) mod size_tracking_writer; - } } #[derive(Debug)] @@ -221,7 +218,7 @@ impl VirtualFile { self.inner.read_exact_at_page(page, offset, ctx).await } - pub async fn write_all_at( + pub async fn write_all_at( &self, buf: FullSlice, offset: u64, @@ -1325,14 +1322,14 @@ impl Drop for VirtualFileInner { } impl OwnedAsyncWriter for VirtualFile { - #[inline(always)] - async fn write_all( - &mut self, + async fn write_all_at( + &self, buf: FullSlice, + offset: u64, ctx: &RequestContext, - ) -> std::io::Result<(usize, FullSlice)> { - let (buf, res) = VirtualFile::write_all(self, buf, ctx).await; - res.map(move |v| (v, buf)) + ) -> std::io::Result> { + let (buf, res) = VirtualFile::write_all_at(self, buf, offset, ctx).await; + res.map(|_| buf) } } @@ -1451,7 +1448,7 @@ mod tests { } } } - async fn write_all_at( + async fn write_all_at( &self, buf: FullSlice, offset: u64, @@ -1594,6 +1591,7 @@ mod tests { &ctx, ) .await?; + file_a .write_all(b"foobar".to_vec().slice_len(), &ctx) .await?; @@ -1652,10 +1650,10 @@ mod tests { ) .await?; file_b - .write_all_at(b"BAR".to_vec().slice_len(), 3, &ctx) + .write_all_at(IoBuffer::from(b"BAR").slice_len(), 3, &ctx) .await?; file_b - .write_all_at(b"FOO".to_vec().slice_len(), 0, &ctx) + .write_all_at(IoBuffer::from(b"FOO").slice_len(), 0, &ctx) .await?; assert_eq!(file_b.read_string_at(2, 3, &ctx).await?, "OBA"); diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs index 933b78a13b70..6b9992643f2a 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs +++ 
b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/alignment.rs @@ -4,7 +4,7 @@ pub trait Alignment: std::marker::Unpin + 'static { } /// Alignment at compile time. -#[derive(Debug)] +#[derive(Debug, Clone, Copy)] pub struct ConstAlign; impl Alignment for ConstAlign { @@ -14,7 +14,7 @@ impl Alignment for ConstAlign { } /// Alignment at run time. -#[derive(Debug)] +#[derive(Debug, Clone, Copy)] pub struct RuntimeAlign { align: usize, } diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs index 2fba6d699b28..a5c26cd7463a 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer.rs @@ -3,9 +3,10 @@ use std::{ sync::Arc, }; -use super::{alignment::Alignment, raw::RawAlignedBuffer}; +use super::{alignment::Alignment, raw::RawAlignedBuffer, AlignedBufferMut, ConstAlign}; /// An shared, immutable aligned buffer type. +#[derive(Clone, Debug)] pub struct AlignedBuffer { /// Shared raw buffer. raw: Arc>, @@ -86,6 +87,13 @@ impl AlignedBuffer { range: begin..end, } } + + /// Returns the mutable aligned buffer, if the immutable aligned buffer + /// has exactly one strong reference. Otherwise returns `None`. + pub fn into_mut(self) -> Option> { + let raw = Arc::into_inner(self.raw)?; + Some(AlignedBufferMut::from_raw(raw)) + } } impl Deref for AlignedBuffer { @@ -108,6 +116,14 @@ impl PartialEq<[u8]> for AlignedBuffer { } } +impl From<&[u8; N]> for AlignedBuffer> { + fn from(value: &[u8; N]) -> Self { + let mut buf = AlignedBufferMut::with_capacity(N); + buf.extend_from_slice(value); + buf.freeze() + } +} + /// SAFETY: the underlying buffer references a stable memory region. unsafe impl tokio_epoll_uring::IoBuf for AlignedBuffer { fn stable_ptr(&self) -> *const u8 { diff --git a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs index b3675d1aeabb..d2f5e206bb09 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/aligned_buffer/buffer_mut.rs @@ -1,4 +1,7 @@ -use std::ops::{Deref, DerefMut}; +use std::{ + mem::MaybeUninit, + ops::{Deref, DerefMut}, +}; use super::{ alignment::{Alignment, ConstAlign}, @@ -46,6 +49,11 @@ impl AlignedBufferMut> { } impl AlignedBufferMut { + /// Constructs a mutable aligned buffer from raw. + pub(super) fn from_raw(raw: RawAlignedBuffer) -> Self { + AlignedBufferMut { raw } + } + /// Returns the total number of bytes the buffer can hold. #[inline] pub fn capacity(&self) -> usize { @@ -128,6 +136,39 @@ impl AlignedBufferMut { let len = self.len(); AlignedBuffer::from_raw(self.raw, 0..len) } + + /// Clones and appends all elements in a slice to the buffer. Reserves additional capacity as needed. + #[inline] + pub fn extend_from_slice(&mut self, extend: &[u8]) { + let cnt = extend.len(); + self.reserve(cnt); + + // SAFETY: we already reserved additional `cnt` bytes, safe to perform memcpy. + unsafe { + let dst = self.spare_capacity_mut(); + // Reserved above + debug_assert!(dst.len() >= cnt); + + core::ptr::copy_nonoverlapping(extend.as_ptr(), dst.as_mut_ptr().cast(), cnt); + } + // SAFETY: We do have at least `cnt` bytes remaining before advance. 
+ unsafe { + bytes::BufMut::advance_mut(self, cnt); + } + } + + /// Returns the remaining spare capacity of the vector as a slice of `MaybeUninit`. + #[inline] + fn spare_capacity_mut(&mut self) -> &mut [MaybeUninit] { + // SAFETY: we guarantees that the `Self::capacity()` bytes from + // `Self::as_mut_ptr()` are allocated. + unsafe { + let ptr = self.as_mut_ptr().add(self.len()); + let len = self.capacity() - self.len(); + + core::slice::from_raw_parts_mut(ptr.cast(), len) + } + } } impl Deref for AlignedBufferMut { diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs index dba695196ebb..4ea6b1774447 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_aligned.rs @@ -1,9 +1,15 @@ -use tokio_epoll_uring::IoBufMut; +use tokio_epoll_uring::{IoBuf, IoBufMut}; -use crate::virtual_file::{IoBufferMut, PageWriteGuardBuf}; +use crate::virtual_file::{IoBuffer, IoBufferMut, PageWriteGuardBuf}; +/// A marker trait for a mutable aligned buffer type. pub trait IoBufAlignedMut: IoBufMut {} +/// A marker trait for an aligned buffer type. +pub trait IoBufAligned: IoBuf {} + impl IoBufAlignedMut for IoBufferMut {} +impl IoBufAligned for IoBuffer {} + impl IoBufAlignedMut for PageWriteGuardBuf {} diff --git a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs index c3940cf6cea2..525f447b6dac 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/io_buf_ext.rs @@ -5,6 +5,8 @@ use bytes::{Bytes, BytesMut}; use std::ops::{Deref, Range}; use tokio_epoll_uring::{BoundedBuf, IoBuf, Slice}; +use super::write::CheapCloneForRead; + /// The true owned equivalent for Rust [`slice`]. Use this for the write path. /// /// Unlike [`tokio_epoll_uring::Slice`], which we unfortunately inherited from `tokio-uring`, @@ -43,6 +45,18 @@ where } } +impl CheapCloneForRead for FullSlice +where + B: IoBuf + CheapCloneForRead, +{ + fn cheap_clone(&self) -> Self { + let bounds = self.slice.bounds(); + let clone = self.slice.get_ref().cheap_clone(); + let slice = clone.slice(bounds); + Self { slice } + } +} + pub(crate) trait IoBufExt { /// Get a [`FullSlice`] for the entire buffer, i.e., `self[..]` or `self[0..self.len()]`. fn slice_len(self) -> FullSlice diff --git a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs b/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs deleted file mode 100644 index efcb61ba6532..000000000000 --- a/pageserver/src/virtual_file/owned_buffers_io/util/size_tracking_writer.rs +++ /dev/null @@ -1,50 +0,0 @@ -use crate::{ - context::RequestContext, - virtual_file::owned_buffers_io::{io_buf_ext::FullSlice, write::OwnedAsyncWriter}, -}; -use tokio_epoll_uring::IoBuf; - -pub struct Writer { - dst: W, - bytes_amount: u64, -} - -impl Writer { - pub fn new(dst: W) -> Self { - Self { - dst, - bytes_amount: 0, - } - } - - pub fn bytes_written(&self) -> u64 { - self.bytes_amount - } - - pub fn as_inner(&self) -> &W { - &self.dst - } - - /// Returns the wrapped `VirtualFile` object as well as the number - /// of bytes that were written to it through this object. 
- #[cfg_attr(target_os = "macos", allow(dead_code))] - pub fn into_inner(self) -> (u64, W) { - (self.bytes_amount, self.dst) - } -} - -impl OwnedAsyncWriter for Writer -where - W: OwnedAsyncWriter, -{ - #[inline(always)] - async fn write_all( - &mut self, - buf: FullSlice, - ctx: &RequestContext, - ) -> std::io::Result<(usize, FullSlice)> { - let (nwritten, buf) = self.dst.write_all(buf, ctx).await?; - self.bytes_amount += u64::try_from(nwritten).unwrap(); - Ok((nwritten, buf)) - } -} diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index 568cf62e5617..20bf87812312 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -1,55 +1,88 @@ -use bytes::BytesMut; +mod flush; +use std::sync::Arc; + +use flush::FlushHandle; use tokio_epoll_uring::IoBuf; -use crate::context::RequestContext; +use crate::{ + context::RequestContext, + virtual_file::{IoBuffer, IoBufferMut}, +}; + +use super::{ + io_buf_aligned::IoBufAligned, + io_buf_ext::{FullSlice, IoBufExt}, +}; -use super::io_buf_ext::{FullSlice, IoBufExt}; +pub(crate) use flush::FlushControl; + +pub(crate) trait CheapCloneForRead { + /// Returns a cheap clone of the buffer. + fn cheap_clone(&self) -> Self; +} + +impl CheapCloneForRead for IoBuffer { + fn cheap_clone(&self) -> Self { + // Cheap clone over an `Arc`. + self.clone() + } +} /// A trait for doing owned-buffer write IO. /// Think [`tokio::io::AsyncWrite`] but with owned buffers. +/// The owned buffers need to be aligned due to Direct IO requirements. pub trait OwnedAsyncWriter { - async fn write_all( - &mut self, + fn write_all_at( + &self, buf: FullSlice, + offset: u64, ctx: &RequestContext, - ) -> std::io::Result<(usize, FullSlice)>; + ) -> impl std::future::Future>> + Send; } /// A wrapper aorund an [`OwnedAsyncWriter`] that uses a [`Buffer`] to batch /// small writes into larger writes of size [`Buffer::cap`]. -/// -/// # Passthrough Of Large Writers -/// -/// Calls to [`BufferedWriter::write_buffered`] that are larger than [`Buffer::cap`] -/// cause the internal buffer to be flushed prematurely so that the large -/// buffered write is passed through to the underlying [`OwnedAsyncWriter`]. -/// -/// This pass-through is generally beneficial for throughput, but if -/// the storage backend of the [`OwnedAsyncWriter`] is a shared resource, -/// unlimited large writes may cause latency or fairness issues. -/// -/// In such cases, a different implementation that always buffers in memory -/// may be preferable. -pub struct BufferedWriter { - writer: W, +// TODO(yuchen): For large write, implementing buffer bypass for aligned parts of the write could be beneficial to throughput, +// since we would avoid copying majority of the data into the internal buffer. +pub struct BufferedWriter { + writer: Arc, /// invariant: always remains Some(buf) except /// - while IO is ongoing => goes back to Some() once the IO completed successfully /// - after an IO error => stays `None` forever /// /// In these exceptional cases, it's `None`. - buf: Option, + mutable: Option, + /// A handle to the background flush task for writting data to disk. + flush_handle: FlushHandle, + /// The number of bytes submitted to the background task. 
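+    /// Submitted means handed to the background flush task; it does not imply the
+    /// bytes have been written to disk yet.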
+ bytes_submitted: u64, } impl BufferedWriter where - B: Buffer + Send, - Buf: IoBuf + Send, - W: OwnedAsyncWriter, + B: Buffer + Send + 'static, + Buf: IoBufAligned + Send + Sync + CheapCloneForRead, + W: OwnedAsyncWriter + Send + Sync + 'static + std::fmt::Debug, { - pub fn new(writer: W, buf: B) -> Self { + /// Creates a new buffered writer. + /// + /// The `buf_new` function provides a way to initialize the owned buffers used by this writer. + pub fn new( + writer: Arc, + buf_new: impl Fn() -> B, + gate_guard: utils::sync::gate::GateGuard, + ctx: &RequestContext, + ) -> Self { Self { - writer, - buf: Some(buf), + writer: writer.clone(), + mutable: Some(buf_new()), + flush_handle: FlushHandle::spawn_new( + writer, + buf_new(), + gate_guard, + ctx.attached_child(), + ), + bytes_submitted: 0, } } @@ -57,87 +90,70 @@ where &self.writer } + /// Returns the number of bytes submitted to the background flush task. + pub fn bytes_submitted(&self) -> u64 { + self.bytes_submitted + } + /// Panics if used after any of the write paths returned an error - pub fn inspect_buffer(&self) -> &B { - self.buf() + pub fn inspect_mutable(&self) -> &B { + self.mutable() + } + + /// Gets a reference to the maybe flushed read-only buffer. + /// Returns `None` if the writer has not submitted any flush request. + pub fn inspect_maybe_flushed(&self) -> Option<&FullSlice> { + self.flush_handle.maybe_flushed.as_ref() } #[cfg_attr(target_os = "macos", allow(dead_code))] - pub async fn flush_and_into_inner(mut self, ctx: &RequestContext) -> std::io::Result { + pub async fn flush_and_into_inner( + mut self, + ctx: &RequestContext, + ) -> std::io::Result<(u64, Arc)> { self.flush(ctx).await?; - let Self { buf, writer } = self; + let Self { + mutable: buf, + writer, + mut flush_handle, + bytes_submitted: bytes_amount, + } = self; + flush_handle.shutdown().await?; assert!(buf.is_some()); - Ok(writer) + Ok((bytes_amount, writer)) } + /// Gets a reference to the mutable in-memory buffer. #[inline(always)] - fn buf(&self) -> &B { - self.buf + fn mutable(&self) -> &B { + self.mutable .as_ref() .expect("must not use after we returned an error") } - /// Guarantees that if Ok() is returned, all bytes in `chunk` have been accepted. 
- #[cfg_attr(target_os = "macos", allow(dead_code))] - pub async fn write_buffered( + pub async fn write_buffered_borrowed( &mut self, - chunk: FullSlice, + chunk: &[u8], ctx: &RequestContext, - ) -> std::io::Result<(usize, FullSlice)> { - let chunk = chunk.into_raw_slice(); - - let chunk_len = chunk.len(); - // avoid memcpy for the middle of the chunk - if chunk.len() >= self.buf().cap() { - self.flush(ctx).await?; - // do a big write, bypassing `buf` - assert_eq!( - self.buf - .as_ref() - .expect("must not use after an error") - .pending(), - 0 - ); - let (nwritten, chunk) = self - .writer - .write_all(FullSlice::must_new(chunk), ctx) - .await?; - assert_eq!(nwritten, chunk_len); - return Ok((nwritten, chunk)); - } - // in-memory copy the < BUFFER_SIZED tail of the chunk - assert!(chunk.len() < self.buf().cap()); - let mut slice = &chunk[..]; - while !slice.is_empty() { - let buf = self.buf.as_mut().expect("must not use after an error"); - let need = buf.cap() - buf.pending(); - let have = slice.len(); - let n = std::cmp::min(need, have); - buf.extend_from_slice(&slice[..n]); - slice = &slice[n..]; - if buf.pending() >= buf.cap() { - assert_eq!(buf.pending(), buf.cap()); - self.flush(ctx).await?; - } + ) -> std::io::Result { + let (len, control) = self.write_buffered_borrowed_controlled(chunk, ctx).await?; + if let Some(control) = control { + control.release().await; } - assert!(slice.is_empty(), "by now we should have drained the chunk"); - Ok((chunk_len, FullSlice::must_new(chunk))) + Ok(len) } - /// Strictly less performant variant of [`Self::write_buffered`] that allows writing borrowed data. - /// - /// It is less performant because we always have to copy the borrowed data into the internal buffer - /// before we can do the IO. The [`Self::write_buffered`] can avoid this, which is more performant - /// for large writes. - pub async fn write_buffered_borrowed( + /// In addition to bytes submitted in this write, also returns a handle that can control the flush behavior. 
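+    /// The handle is `Some` only if this write caused at least one flush, and it refers
+    /// to the most recent one; callers that do not need to control flush timing simply
+    /// `release()` it (see [`Self::write_buffered_borrowed`]).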
+ pub(crate) async fn write_buffered_borrowed_controlled( &mut self, mut chunk: &[u8], ctx: &RequestContext, - ) -> std::io::Result { + ) -> std::io::Result<(usize, Option)> { let chunk_len = chunk.len(); + let mut control: Option = None; while !chunk.is_empty() { - let buf = self.buf.as_mut().expect("must not use after an error"); + let buf = self.mutable.as_mut().expect("must not use after an error"); let need = buf.cap() - buf.pending(); let have = chunk.len(); let n = std::cmp::min(need, have); @@ -145,26 +161,27 @@ where chunk = &chunk[n..]; if buf.pending() >= buf.cap() { assert_eq!(buf.pending(), buf.cap()); - self.flush(ctx).await?; + if let Some(control) = control.take() { + control.release().await; + } + control = self.flush(ctx).await?; } } - Ok(chunk_len) + Ok((chunk_len, control)) } - async fn flush(&mut self, ctx: &RequestContext) -> std::io::Result<()> { - let buf = self.buf.take().expect("must not use after an error"); + #[must_use = "caller must explcitly check the flush control"] + async fn flush(&mut self, _ctx: &RequestContext) -> std::io::Result> { + let buf = self.mutable.take().expect("must not use after an error"); let buf_len = buf.pending(); if buf_len == 0 { - self.buf = Some(buf); - return Ok(()); + self.mutable = Some(buf); + return Ok(None); } - let slice = buf.flush(); - let (nwritten, slice) = self.writer.write_all(slice, ctx).await?; - assert_eq!(nwritten, buf_len); - self.buf = Some(Buffer::reuse_after_flush( - slice.into_raw_slice().into_inner(), - )); - Ok(()) + let (recycled, flush_control) = self.flush_handle.flush(buf, self.bytes_submitted).await?; + self.bytes_submitted += u64::try_from(buf_len).unwrap(); + self.mutable = Some(recycled); + Ok(Some(flush_control)) } } @@ -192,64 +209,77 @@ pub trait Buffer { fn reuse_after_flush(iobuf: Self::IoBuf) -> Self; } -impl Buffer for BytesMut { - type IoBuf = BytesMut; +impl Buffer for IoBufferMut { + type IoBuf = IoBuffer; - #[inline(always)] fn cap(&self) -> usize { self.capacity() } fn extend_from_slice(&mut self, other: &[u8]) { - BytesMut::extend_from_slice(self, other) + if self.len() + other.len() > self.cap() { + panic!("Buffer capacity exceeded"); + } + + IoBufferMut::extend_from_slice(self, other); } - #[inline(always)] fn pending(&self) -> usize { self.len() } - fn flush(self) -> FullSlice { - self.slice_len() + fn flush(self) -> FullSlice { + self.freeze().slice_len() } - fn reuse_after_flush(mut iobuf: BytesMut) -> Self { - iobuf.clear(); - iobuf - } -} - -impl OwnedAsyncWriter for Vec { - async fn write_all( - &mut self, - buf: FullSlice, - _: &RequestContext, - ) -> std::io::Result<(usize, FullSlice)> { - self.extend_from_slice(&buf[..]); - Ok((buf.len(), buf)) + /// Caller should make sure that `iobuf` only have one strong reference before invoking this method. + fn reuse_after_flush(iobuf: Self::IoBuf) -> Self { + let mut recycled = iobuf + .into_mut() + .expect("buffer should only have one strong reference"); + recycled.clear(); + recycled } } #[cfg(test)] mod tests { - use bytes::BytesMut; + use std::sync::Mutex; use super::*; use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::TaskKind; - #[derive(Default)] + #[derive(Default, Debug)] struct RecorderWriter { - writes: Vec>, + /// record bytes and write offsets. + writes: Mutex, u64)>>, } + + impl RecorderWriter { + /// Gets recorded bytes and write offsets. 
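+        /// Only the byte payloads are returned; the recorded offsets are kept in the
+        /// writer but are not asserted on by these tests.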
+ fn get_writes(&self) -> Vec> { + self.writes + .lock() + .unwrap() + .iter() + .map(|(buf, _)| buf.clone()) + .collect() + } + } + impl OwnedAsyncWriter for RecorderWriter { - async fn write_all( - &mut self, + async fn write_all_at( + &self, buf: FullSlice, + offset: u64, _: &RequestContext, - ) -> std::io::Result<(usize, FullSlice)> { - self.writes.push(Vec::from(&buf[..])); - Ok((buf.len(), buf)) + ) -> std::io::Result> { + self.writes + .lock() + .unwrap() + .push((Vec::from(&buf[..]), offset)); + Ok(buf) } } @@ -257,71 +287,21 @@ mod tests { RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error) } - macro_rules! write { - ($writer:ident, $data:literal) => {{ - $writer - .write_buffered(::bytes::Bytes::from_static($data).slice_len(), &test_ctx()) - .await?; - }}; - } - #[tokio::test] - async fn test_buffered_writes_only() -> std::io::Result<()> { - let recorder = RecorderWriter::default(); - let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); - write!(writer, b"a"); - write!(writer, b"b"); - write!(writer, b"c"); - write!(writer, b"d"); - write!(writer, b"e"); - let recorder = writer.flush_and_into_inner(&test_ctx()).await?; - assert_eq!( - recorder.writes, - vec![Vec::from(b"ab"), Vec::from(b"cd"), Vec::from(b"e")] - ); - Ok(()) - } - - #[tokio::test] - async fn test_passthrough_writes_only() -> std::io::Result<()> { - let recorder = RecorderWriter::default(); - let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); - write!(writer, b"abc"); - write!(writer, b"de"); - write!(writer, b""); - write!(writer, b"fghijk"); - let recorder = writer.flush_and_into_inner(&test_ctx()).await?; - assert_eq!( - recorder.writes, - vec![Vec::from(b"abc"), Vec::from(b"de"), Vec::from(b"fghijk")] - ); - Ok(()) - } - - #[tokio::test] - async fn test_passthrough_write_with_nonempty_buffer() -> std::io::Result<()> { - let recorder = RecorderWriter::default(); - let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); - write!(writer, b"a"); - write!(writer, b"bc"); - write!(writer, b"d"); - write!(writer, b"e"); - let recorder = writer.flush_and_into_inner(&test_ctx()).await?; - assert_eq!( - recorder.writes, - vec![Vec::from(b"a"), Vec::from(b"bc"), Vec::from(b"de")] - ); - Ok(()) - } - - #[tokio::test] - async fn test_write_all_borrowed_always_goes_through_buffer() -> std::io::Result<()> { + async fn test_write_all_borrowed_always_goes_through_buffer() -> anyhow::Result<()> { let ctx = test_ctx(); let ctx = &ctx; - let recorder = RecorderWriter::default(); - let mut writer = BufferedWriter::new(recorder, BytesMut::with_capacity(2)); + let recorder = Arc::new(RecorderWriter::default()); + let gate = utils::sync::gate::Gate::default(); + let mut writer = BufferedWriter::<_, RecorderWriter>::new( + recorder, + || IoBufferMut::with_capacity(2), + gate.enter()?, + ctx, + ); writer.write_buffered_borrowed(b"abc", ctx).await?; + writer.write_buffered_borrowed(b"", ctx).await?; writer.write_buffered_borrowed(b"d", ctx).await?; writer.write_buffered_borrowed(b"e", ctx).await?; writer.write_buffered_borrowed(b"fg", ctx).await?; @@ -329,9 +309,9 @@ mod tests { writer.write_buffered_borrowed(b"j", ctx).await?; writer.write_buffered_borrowed(b"klmno", ctx).await?; - let recorder = writer.flush_and_into_inner(ctx).await?; + let (_, recorder) = writer.flush_and_into_inner(ctx).await?; assert_eq!( - recorder.writes, + recorder.get_writes(), { let expect: &[&[u8]] = &[b"ab", b"cd", b"ef", b"gh", b"ij", b"kl", b"mn", b"o"]; expect diff --git 
a/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs new file mode 100644 index 000000000000..9ce8b311bb5c --- /dev/null +++ b/pageserver/src/virtual_file/owned_buffers_io/write/flush.rs @@ -0,0 +1,314 @@ +use std::sync::Arc; + +use utils::sync::duplex; + +use crate::{ + context::RequestContext, + virtual_file::owned_buffers_io::{io_buf_aligned::IoBufAligned, io_buf_ext::FullSlice}, +}; + +use super::{Buffer, CheapCloneForRead, OwnedAsyncWriter}; + +/// A handle to the flush task. +pub struct FlushHandle { + inner: Option>, + /// Immutable buffer for serving tail reads. + /// `None` if no flush request has been submitted. + pub(super) maybe_flushed: Option>, +} + +pub struct FlushHandleInner { + /// A bi-directional channel that sends (buffer, offset) for writes, + /// and receives recyled buffer. + channel: duplex::mpsc::Duplex, FullSlice>, + /// Join handle for the background flush task. + join_handle: tokio::task::JoinHandle>>, +} + +struct FlushRequest { + slice: FullSlice, + offset: u64, + #[cfg(test)] + ready_to_flush_rx: tokio::sync::oneshot::Receiver<()>, + #[cfg(test)] + done_flush_tx: tokio::sync::oneshot::Sender<()>, +} + +/// Constructs a request and a control object for a new flush operation. +#[cfg(not(test))] +fn new_flush_op(slice: FullSlice, offset: u64) -> (FlushRequest, FlushControl) { + let request = FlushRequest { slice, offset }; + let control = FlushControl::untracked(); + + (request, control) +} + +/// Constructs a request and a control object for a new flush operation. +#[cfg(test)] +fn new_flush_op(slice: FullSlice, offset: u64) -> (FlushRequest, FlushControl) { + let (ready_to_flush_tx, ready_to_flush_rx) = tokio::sync::oneshot::channel(); + let (done_flush_tx, done_flush_rx) = tokio::sync::oneshot::channel(); + let control = FlushControl::not_started(ready_to_flush_tx, done_flush_rx); + + let request = FlushRequest { + slice, + offset, + ready_to_flush_rx, + done_flush_tx, + }; + (request, control) +} + +/// A handle to a `FlushRequest` that allows unit tests precise control over flush behavior. +#[cfg(test)] +pub(crate) struct FlushControl { + not_started: FlushNotStarted, +} + +#[cfg(not(test))] +pub(crate) struct FlushControl; + +impl FlushControl { + #[cfg(test)] + fn not_started( + ready_to_flush_tx: tokio::sync::oneshot::Sender<()>, + done_flush_rx: tokio::sync::oneshot::Receiver<()>, + ) -> Self { + FlushControl { + not_started: FlushNotStarted { + ready_to_flush_tx, + done_flush_rx, + }, + } + } + + #[cfg(not(test))] + fn untracked() -> Self { + FlushControl + } + + /// In tests, turn flush control into a not started state. + #[cfg(test)] + pub(crate) fn into_not_started(self) -> FlushNotStarted { + self.not_started + } + + /// Release control to the submitted buffer. + /// + /// In `cfg(test)` environment, the buffer is guranteed to be flushed to disk after [`FlushControl::release`] is finishes execution. + pub async fn release(self) { + #[cfg(test)] + { + self.not_started + .ready_to_flush() + .wait_until_flush_is_done() + .await; + } + } +} + +impl FlushHandle +where + Buf: IoBufAligned + Send + Sync + CheapCloneForRead, + W: OwnedAsyncWriter + Send + Sync + 'static + std::fmt::Debug, +{ + /// Spawns a new background flush task and obtains a handle. + /// + /// Note: The background task so we do not need to explicitly maintain a queue of buffers. 
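+    /// A duplex channel of capacity 1 plays that role: at most one flush request is
+    /// in flight, and submitting the next buffer waits for the previous one to be
+    /// handed back, which is what provides backpressure on the write path.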
+ pub fn spawn_new( + file: Arc, + buf: B, + gate_guard: utils::sync::gate::GateGuard, + ctx: RequestContext, + ) -> Self + where + B: Buffer + Send + 'static, + { + // It is fine to buffer up to only 1 message. We only 1 message in-flight at a time. + let (front, back) = duplex::mpsc::channel(1); + + let join_handle = tokio::spawn(async move { + FlushBackgroundTask::new(back, file, gate_guard, ctx) + .run(buf.flush()) + .await + }); + + FlushHandle { + inner: Some(FlushHandleInner { + channel: front, + join_handle, + }), + maybe_flushed: None, + } + } + + /// Submits a buffer to be flushed in the background task. + /// Returns a buffer that completed flushing for re-use, length reset to 0, capacity unchanged. + /// If `save_buf_for_read` is true, then we save the buffer in `Self::maybe_flushed`, otherwise + /// clear `maybe_flushed`. + pub async fn flush(&mut self, buf: B, offset: u64) -> std::io::Result<(B, FlushControl)> + where + B: Buffer + Send + 'static, + { + let slice = buf.flush(); + + // Saves a buffer for read while flushing. This also removes reference to the old buffer. + self.maybe_flushed = Some(slice.cheap_clone()); + + let (request, flush_control) = new_flush_op(slice, offset); + + // Submits the buffer to the background task. + let submit = self.inner_mut().channel.send(request).await; + if submit.is_err() { + return self.handle_error().await; + } + + // Wait for an available buffer from the background flush task. + // This is the BACKPRESSURE mechanism: if the flush task can't keep up, + // then the write path will eventually wait for it here. + let Some(recycled) = self.inner_mut().channel.recv().await else { + return self.handle_error().await; + }; + + // The only other place that could hold a reference to the recycled buffer + // is in `Self::maybe_flushed`, but we have already replace it with the new buffer. + let recycled = Buffer::reuse_after_flush(recycled.into_raw_slice().into_inner()); + Ok((recycled, flush_control)) + } + + async fn handle_error(&mut self) -> std::io::Result { + Err(self + .shutdown() + .await + .expect_err("flush task only disconnects duplex if it exits with an error")) + } + + /// Cleans up the channel, join the flush task. + pub async fn shutdown(&mut self) -> std::io::Result> { + let handle = self + .inner + .take() + .expect("must not use after we returned an error"); + drop(handle.channel.tx); + handle.join_handle.await.unwrap() + } + + /// Gets a mutable reference to the inner handle. Panics if [`Self::inner`] is `None`. + /// This only happens if the handle is used after an error. + fn inner_mut(&mut self) -> &mut FlushHandleInner { + self.inner + .as_mut() + .expect("must not use after we returned an error") + } +} + +/// A background task for flushing data to disk. +pub struct FlushBackgroundTask { + /// A bi-directional channel that receives (buffer, offset) for writes, + /// and send back recycled buffer. + channel: duplex::mpsc::Duplex, FlushRequest>, + /// A writter for persisting data to disk. + writer: Arc, + ctx: RequestContext, + /// Prevent timeline from shuting down until the flush background task finishes flushing all remaining buffers to disk. + _gate_guard: utils::sync::gate::GateGuard, +} + +impl FlushBackgroundTask +where + Buf: IoBufAligned + Send + Sync, + W: OwnedAsyncWriter + Sync + 'static, +{ + /// Creates a new background flush task. 
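+    /// The task holds a gate guard so that timeline shutdown waits for any remaining
+    /// buffers to be flushed to disk.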
+ fn new( + channel: duplex::mpsc::Duplex, FlushRequest>, + file: Arc, + gate_guard: utils::sync::gate::GateGuard, + ctx: RequestContext, + ) -> Self { + FlushBackgroundTask { + channel, + writer: file, + _gate_guard: gate_guard, + ctx, + } + } + + /// Runs the background flush task. + /// The passed in slice is immediately sent back to the flush handle through the duplex channel. + async fn run(mut self, slice: FullSlice) -> std::io::Result> { + // Sends the extra buffer back to the handle. + self.channel.send(slice).await.map_err(|_| { + std::io::Error::new(std::io::ErrorKind::BrokenPipe, "flush handle closed early") + })?; + + // Exit condition: channel is closed and there is no remaining buffer to be flushed + while let Some(request) = self.channel.recv().await { + #[cfg(test)] + { + // In test, wait for control to signal that we are ready to flush. + if request.ready_to_flush_rx.await.is_err() { + tracing::debug!("control dropped"); + } + } + + // Write slice to disk at `offset`. + let slice = self + .writer + .write_all_at(request.slice, request.offset, &self.ctx) + .await?; + + #[cfg(test)] + { + // In test, tell control we are done flushing buffer. + if request.done_flush_tx.send(()).is_err() { + tracing::debug!("control dropped"); + } + } + + // Sends the buffer back to the handle for reuse. The handle is in charged of cleaning the buffer. + if self.channel.send(slice).await.is_err() { + // Although channel is closed. Still need to finish flushing the remaining buffers. + continue; + } + } + + Ok(self.writer) + } +} + +#[cfg(test)] +pub(crate) struct FlushNotStarted { + ready_to_flush_tx: tokio::sync::oneshot::Sender<()>, + done_flush_rx: tokio::sync::oneshot::Receiver<()>, +} + +#[cfg(test)] +pub(crate) struct FlushInProgress { + done_flush_rx: tokio::sync::oneshot::Receiver<()>, +} + +#[cfg(test)] +pub(crate) struct FlushDone; + +#[cfg(test)] +impl FlushNotStarted { + /// Signals the background task the buffer is ready to flush to disk. + pub fn ready_to_flush(self) -> FlushInProgress { + self.ready_to_flush_tx + .send(()) + .map(|_| FlushInProgress { + done_flush_rx: self.done_flush_rx, + }) + .unwrap() + } +} + +#[cfg(test)] +impl FlushInProgress { + /// Waits until background flush is done. 
+ pub async fn wait_until_flush_is_done(self) -> FlushDone { + self.done_flush_rx.await.unwrap(); + FlushDone + } +} From 0bab7e30863c7d41087decf351517c0fb5a2e1b5 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 4 Dec 2024 17:42:17 +0000 Subject: [PATCH 41/65] chore: update clap (#10009) This updates clap to use a new version of anstream --- Cargo.lock | 62 +++++++++++++++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 29 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 38158b7aec0c..de8785f87ed5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -84,16 +84,16 @@ checksum = "4b46cbb362ab8752921c97e041f5e366ee6297bd428a31275b9fcf1e380f7299" [[package]] name = "anstream" -version = "0.3.2" +version = "0.6.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0ca84f3628370c59db74ee214b3263d58f9aadd9b4fe7e711fd87dc452b7f163" +checksum = "64e15c1ab1f89faffbf04a634d5e1962e9074f2741eef6d97f3c4e322426d526" dependencies = [ "anstyle", "anstyle-parse", "anstyle-query", "anstyle-wincon", "colorchoice", - "is-terminal", + "is_terminal_polyfill", "utf8parse", ] @@ -123,12 +123,12 @@ dependencies = [ [[package]] name = "anstyle-wincon" -version = "1.0.1" +version = "3.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "180abfa45703aebe0093f79badacc01b8fd4ea2e35118747e5811127f926e188" +checksum = "5bf74e1b6e971609db8ca7a9ce79fd5768ab6ae46441c572e46cf596f59e57f8" dependencies = [ "anstyle", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] @@ -1167,35 +1167,33 @@ dependencies = [ [[package]] name = "clap" -version = "4.3.0" +version = "4.5.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "93aae7a4192245f70fe75dd9157fc7b4a5bf53e88d30bd4396f7d8f9284d5acc" +checksum = "69371e34337c4c984bbe322360c2547210bf632eb2814bbe78a6e87a2935bd2b" dependencies = [ "clap_builder", "clap_derive", - "once_cell", ] [[package]] name = "clap_builder" -version = "4.3.0" +version = "4.5.22" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4f423e341edefb78c9caba2d9c7f7687d0e72e89df3ce3394554754393ac3990" +checksum = "6e24c1b4099818523236a8ca881d2b45db98dadfb4625cf6608c12069fcbbde1" dependencies = [ "anstream", "anstyle", - "bitflags 1.3.2", "clap_lex", - "strsim", + "strsim 0.11.1", ] [[package]] name = "clap_derive" -version = "4.3.0" +version = "4.5.18" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "191d9573962933b4027f932c600cd252ce27a8ad5979418fe78e43c07996f27b" +checksum = "4ac6a0c7b1a9e9a5186361f67dfa1b88213572f427fb9ab038efb2bd8c582dab" dependencies = [ - "heck 0.4.1", + "heck", "proc-macro2", "quote", "syn 2.0.90", @@ -1203,9 +1201,9 @@ dependencies = [ [[package]] name = "clap_lex" -version = "0.5.0" +version = "0.7.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2da6da31387c7e4ef160ffab6d5e7f00c42626fe39aea70a7b0f1773f7dd6c1b" +checksum = "afb84c814227b90d6895e01398aee0d8033c00e7466aca416fb6a8e0eb19d8a7" [[package]] name = "colorchoice" @@ -1614,7 +1612,7 @@ dependencies = [ "ident_case", "proc-macro2", "quote", - "strsim", + "strsim 0.10.0", "syn 2.0.90", ] @@ -1812,7 +1810,7 @@ checksum = "0892a17df262a24294c382f0d5997571006e7a4348b4327557c4ff1cd4a8bccc" dependencies = [ "darling", "either", - "heck 0.5.0", + "heck", "proc-macro2", "quote", "syn 2.0.90", @@ -2465,12 +2463,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "heck" -version = "0.4.1" -source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "95505c38b4572b2d910cecb0281560f54b440a19336cbbcb27bf6ce6adc6f5a8" - [[package]] name = "heck" version = "0.5.0" @@ -2888,6 +2880,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "is_terminal_polyfill" +version = "1.70.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" + [[package]] name = "itertools" version = "0.10.5" @@ -3169,7 +3167,7 @@ version = "0.0.22" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b9e6777fc80a575f9503d908c8b498782a6c3ee88a06cb416dc3941401e43b94" dependencies = [ - "heck 0.5.0", + "heck", "proc-macro2", "quote", "syn 2.0.90", @@ -4458,7 +4456,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c1318b19085f08681016926435853bbf7858f9c082d0999b80550ff5d9abe15" dependencies = [ "bytes", - "heck 0.5.0", + "heck", "itertools 0.12.1", "log", "multimap", @@ -6166,6 +6164,12 @@ version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" +[[package]] +name = "strsim" +version = "0.11.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" + [[package]] name = "strum" version = "0.26.3" @@ -6178,7 +6182,7 @@ version = "0.26.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "4c6bee85a5a24955dc440386795aa378cd9cf82acd5f764469152d2270e581be" dependencies = [ - "heck 0.5.0", + "heck", "proc-macro2", "quote", "rustversion", From 131585eb6bd206907a969f8eab44017b282d1556 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 4 Dec 2024 21:07:44 +0000 Subject: [PATCH 42/65] chore: update rust-postgres (#10002) Like #9931 but without rebasing upstream just yet, to try and minimise the differences. Removes all proxy-specific commits from the rust-postgres fork, now that proxy no longer depends on them. Merging upstream changes to come later. 
--- Cargo.lock | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index de8785f87ed5..62f06d45bd52 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4169,7 +4169,7 @@ dependencies = [ [[package]] name = "postgres" version = "0.19.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473" dependencies = [ "bytes", "fallible-iterator", @@ -4182,7 +4182,7 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473" dependencies = [ "base64 0.20.0", "byteorder", @@ -4195,7 +4195,6 @@ dependencies = [ "rand 0.8.5", "sha2", "stringprep", - "tokio", ] [[package]] @@ -4217,7 +4216,7 @@ dependencies = [ [[package]] name = "postgres-types" version = "0.2.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473" dependencies = [ "bytes", "fallible-iterator", @@ -6547,7 +6546,7 @@ dependencies = [ [[package]] name = "tokio-postgres" version = "0.7.7" -source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#00940fcdb57a8e99e805297b75839e7c4c7b1796" +source = "git+https://github.com/neondatabase/rust-postgres.git?branch=neon#511f998c00148ab7c847bd7e6cfd3a906d0e7473" dependencies = [ "async-trait", "byteorder", From ed2d89211306ca892dce41159bc1cc8e9e1646a5 Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Wed, 4 Dec 2024 21:16:09 -0500 Subject: [PATCH 43/65] pageserver: fix buffered-writer on macos build (#10019) ## Problem In https://github.com/neondatabase/neon/pull/9693, we forgot to check macos build. The [CI run](https://github.com/neondatabase/neon/actions/runs/12164541897/job/33926455468) on main showed that macos build failed with unused variables and dead code. ## Summary of changes - add `allow(dead_code)` and `allow(unused_variables)` to the relevant code that is not used on macos. 
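For readers skimming the diff, a minimal sketch of the `cfg_attr` pattern this patch applies; the item and parameter names are illustrative, not the actual pageserver symbols:

```rust
// Silence the unused-variable lint only on macOS, where the parameter is not
// consumed because the tokio-epoll-uring write path is linux-only.
#[cfg_attr(target_os = "macos", allow(unused_variables))]
fn download_with_gate(gate: &str) {
    #[cfg(not(target_os = "macos"))]
    println!("entered gate: {gate}");
}

fn main() {
    download_with_gate("timeline-gate");
}
```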
Signed-off-by: Yuchen Liang --- pageserver/src/tenant/remote_timeline_client/download.rs | 7 ++++--- pageserver/src/virtual_file/owned_buffers_io/write.rs | 1 + 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pageserver/src/tenant/remote_timeline_client/download.rs b/pageserver/src/tenant/remote_timeline_client/download.rs index c5ae466f3afb..d15f161fb6da 100644 --- a/pageserver/src/tenant/remote_timeline_client/download.rs +++ b/pageserver/src/tenant/remote_timeline_client/download.rs @@ -6,7 +6,6 @@ use std::collections::HashSet; use std::future::Future; use std::str::FromStr; -use std::sync::Arc; use std::time::SystemTime; use anyhow::{anyhow, Context}; @@ -27,7 +26,7 @@ use crate::span::{ use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path}; use crate::tenant::storage_layer::LayerName; use crate::tenant::Generation; -use crate::virtual_file::{on_fatal_io_error, IoBufferMut, MaybeFatalIo, VirtualFile}; +use crate::virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile}; use crate::TEMP_FILE_SUFFIX; use remote_storage::{ DownloadError, DownloadKind, DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, @@ -150,7 +149,7 @@ async fn download_object<'a>( storage: &'a GenericRemoteStorage, src_path: &RemotePath, dst_path: &Utf8PathBuf, - gate: &utils::sync::gate::Gate, + #[cfg_attr(target_os = "macos", allow(unused_variables))] gate: &utils::sync::gate::Gate, cancel: &CancellationToken, #[cfg_attr(target_os = "macos", allow(unused_variables))] ctx: &RequestContext, ) -> Result { @@ -209,6 +208,8 @@ async fn download_object<'a>( #[cfg(target_os = "linux")] crate::virtual_file::io_engine::IoEngine::TokioEpollUring => { use crate::virtual_file::owned_buffers_io; + use crate::virtual_file::IoBufferMut; + use std::sync::Arc; async { let destination_file = Arc::new( VirtualFile::create(dst_path, ctx) diff --git a/pageserver/src/virtual_file/owned_buffers_io/write.rs b/pageserver/src/virtual_file/owned_buffers_io/write.rs index 20bf87812312..7299d8370301 100644 --- a/pageserver/src/virtual_file/owned_buffers_io/write.rs +++ b/pageserver/src/virtual_file/owned_buffers_io/write.rs @@ -132,6 +132,7 @@ where .expect("must not use after we returned an error") } + #[cfg_attr(target_os = "macos", allow(dead_code))] pub async fn write_buffered_borrowed( &mut self, chunk: &[u8], From ffc9c33eb2383f9970a246bce8712772c7696080 Mon Sep 17 00:00:00 2001 From: Ivan Efremov Date: Thu, 5 Dec 2024 07:30:38 +0200 Subject: [PATCH 44/65] proxy: Present new auth backend cplane_proxy_v1 (#10012) Implement a new auth backend based on the current Neon backend to switch to the new Proxy V1 cplane API. 
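The new backend is exposed as the `cplane-v1` value (alias `control-plane`) of the existing `AuthBackendType` CLI enum; assuming clap's usual kebab-case naming for the `auth_backend` field, it would be selected with something like `--auth-backend cplane-v1`.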
Implements [#21048](https://github.com/neondatabase/cloud/issues/21048) --- proxy/src/auth/backend/mod.rs | 4 + proxy/src/bin/proxy.rs | 99 +++- .../control_plane/client/cplane_proxy_v1.rs | 514 ++++++++++++++++++ proxy/src/control_plane/client/mod.rs | 7 + proxy/src/control_plane/client/neon.rs | 2 +- proxy/src/control_plane/messages.rs | 10 + 6 files changed, 634 insertions(+), 2 deletions(-) create mode 100644 proxy/src/control_plane/client/cplane_proxy_v1.rs diff --git a/proxy/src/auth/backend/mod.rs b/proxy/src/auth/backend/mod.rs index 84a572dcf9f1..1bad7b308623 100644 --- a/proxy/src/auth/backend/mod.rs +++ b/proxy/src/auth/backend/mod.rs @@ -70,6 +70,10 @@ impl std::fmt::Display for Backend<'_, ()> { fn fmt(&self, fmt: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { match self { Self::ControlPlane(api, ()) => match &**api { + ControlPlaneClient::ProxyV1(endpoint) => fmt + .debug_tuple("ControlPlane::ProxyV1") + .field(&endpoint.url()) + .finish(), ControlPlaneClient::Neon(endpoint) => fmt .debug_tuple("ControlPlane::Neon") .field(&endpoint.url()) diff --git a/proxy/src/bin/proxy.rs b/proxy/src/bin/proxy.rs index c929b97d78a5..99144acef094 100644 --- a/proxy/src/bin/proxy.rs +++ b/proxy/src/bin/proxy.rs @@ -46,6 +46,9 @@ enum AuthBackendType { #[value(name("console"), alias("cplane"))] ControlPlane, + #[value(name("cplane-v1"), alias("control-plane"))] + ControlPlaneV1, + #[value(name("link"), alias("control-redirect"))] ConsoleRedirect, @@ -518,6 +521,39 @@ async fn main() -> anyhow::Result<()> { .instrument(span), ); } + } else if let proxy::control_plane::client::ControlPlaneClient::ProxyV1(api) = &**api { + match (redis_notifications_client, regional_redis_client.clone()) { + (None, None) => {} + (client1, client2) => { + let cache = api.caches.project_info.clone(); + if let Some(client) = client1 { + maintenance_tasks.spawn(notifications::task_main( + client, + cache.clone(), + cancel_map.clone(), + args.region.clone(), + )); + } + if let Some(client) = client2 { + maintenance_tasks.spawn(notifications::task_main( + client, + cache.clone(), + cancel_map.clone(), + args.region.clone(), + )); + } + maintenance_tasks.spawn(async move { cache.clone().gc_worker().await }); + } + } + if let Some(regional_redis_client) = regional_redis_client { + let cache = api.caches.endpoints_cache.clone(); + let con = regional_redis_client; + let span = tracing::info_span!("endpoints_cache"); + maintenance_tasks.spawn( + async move { cache.do_read(con, cancellation_token.clone()).await } + .instrument(span), + ); + } } } @@ -662,6 +698,65 @@ fn build_auth_backend( args: &ProxyCliArgs, ) -> anyhow::Result, &'static ConsoleRedirectBackend>> { match &args.auth_backend { + AuthBackendType::ControlPlaneV1 => { + let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; + let project_info_cache_config: ProjectInfoCacheOptions = + args.project_info_cache.parse()?; + let endpoint_cache_config: config::EndpointCacheConfig = + args.endpoint_cache_config.parse()?; + + info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}"); + info!( + "Using AllowedIpsCache (wake_compute) with options={project_info_cache_config:?}" + ); + info!("Using EndpointCacheConfig with options={endpoint_cache_config:?}"); + let caches = Box::leak(Box::new(control_plane::caches::ApiCaches::new( + wake_compute_cache_config, + project_info_cache_config, + endpoint_cache_config, + ))); + + let config::ConcurrencyLockOptions { + shards, + limiter, + epoch, + timeout, + } = 
args.wake_compute_lock.parse()?; + info!(?limiter, shards, ?epoch, "Using NodeLocks (wake_compute)"); + let locks = Box::leak(Box::new(control_plane::locks::ApiLocks::new( + "wake_compute_lock", + limiter, + shards, + timeout, + epoch, + &Metrics::get().wake_compute_lock, + )?)); + tokio::spawn(locks.garbage_collect_worker()); + + let url: proxy::url::ApiUrl = args.auth_endpoint.parse()?; + + let endpoint = http::Endpoint::new(url, http::new_client()); + + let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); + RateBucketInfo::validate(&mut wake_compute_rps_limit)?; + let wake_compute_endpoint_rate_limiter = + Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); + + let api = control_plane::client::cplane_proxy_v1::NeonControlPlaneClient::new( + endpoint, + args.control_plane_token.clone(), + caches, + locks, + wake_compute_endpoint_rate_limiter, + ); + + let api = control_plane::client::ControlPlaneClient::ProxyV1(api); + let auth_backend = auth::Backend::ControlPlane(MaybeOwned::Owned(api), ()); + let config = Box::leak(Box::new(auth_backend)); + + Ok(Either::Left(config)) + } + AuthBackendType::ControlPlane => { let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?; let project_info_cache_config: ProjectInfoCacheOptions = @@ -697,13 +792,15 @@ fn build_auth_backend( )?)); tokio::spawn(locks.garbage_collect_worker()); - let url = args.auth_endpoint.parse()?; + let url: proxy::url::ApiUrl = args.auth_endpoint.parse()?; + let endpoint = http::Endpoint::new(url, http::new_client()); let mut wake_compute_rps_limit = args.wake_compute_limit.clone(); RateBucketInfo::validate(&mut wake_compute_rps_limit)?; let wake_compute_endpoint_rate_limiter = Arc::new(WakeComputeRateLimiter::new(wake_compute_rps_limit)); + let api = control_plane::client::neon::NeonControlPlaneClient::new( endpoint, args.control_plane_token.clone(), diff --git a/proxy/src/control_plane/client/cplane_proxy_v1.rs b/proxy/src/control_plane/client/cplane_proxy_v1.rs new file mode 100644 index 000000000000..e33a37f64366 --- /dev/null +++ b/proxy/src/control_plane/client/cplane_proxy_v1.rs @@ -0,0 +1,514 @@ +//! Production console backend. 
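+//! This client targets the Proxy V1 control-plane API; it mirrors the existing Neon
+//! backend and talks to the `get_endpoint_access_control` and `wake_compute`
+//! endpoints of the new API.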
+ +use std::sync::Arc; +use std::time::Duration; + +use ::http::header::AUTHORIZATION; +use ::http::HeaderName; +use futures::TryFutureExt; +use postgres_client::config::SslMode; +use tokio::time::Instant; +use tracing::{debug, info, info_span, warn, Instrument}; + +use super::super::messages::{ControlPlaneErrorMessage, GetEndpointAccessControl, WakeCompute}; +use crate::auth::backend::jwt::AuthRule; +use crate::auth::backend::ComputeUserInfo; +use crate::cache::Cached; +use crate::context::RequestContext; +use crate::control_plane::caches::ApiCaches; +use crate::control_plane::errors::{ + ControlPlaneError, GetAuthInfoError, GetEndpointJwksError, WakeComputeError, +}; +use crate::control_plane::locks::ApiLocks; +use crate::control_plane::messages::{ColdStartInfo, EndpointJwksResponse, Reason}; +use crate::control_plane::{ + AuthInfo, AuthSecret, CachedAllowedIps, CachedNodeInfo, CachedRoleSecret, NodeInfo, +}; +use crate::metrics::{CacheOutcome, Metrics}; +use crate::rate_limiter::WakeComputeRateLimiter; +use crate::types::{EndpointCacheKey, EndpointId}; +use crate::{compute, http, scram}; + +const X_REQUEST_ID: HeaderName = HeaderName::from_static("x-request-id"); + +#[derive(Clone)] +pub struct NeonControlPlaneClient { + endpoint: http::Endpoint, + pub caches: &'static ApiCaches, + pub(crate) locks: &'static ApiLocks, + pub(crate) wake_compute_endpoint_rate_limiter: Arc, + // put in a shared ref so we don't copy secrets all over in memory + jwt: Arc, +} + +impl NeonControlPlaneClient { + /// Construct an API object containing the auth parameters. + pub fn new( + endpoint: http::Endpoint, + jwt: Arc, + caches: &'static ApiCaches, + locks: &'static ApiLocks, + wake_compute_endpoint_rate_limiter: Arc, + ) -> Self { + Self { + endpoint, + caches, + locks, + wake_compute_endpoint_rate_limiter, + jwt, + } + } + + pub(crate) fn url(&self) -> &str { + self.endpoint.url().as_str() + } + + async fn do_get_auth_info( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + if !self + .caches + .endpoints_cache + .is_valid(ctx, &user_info.endpoint.normalize()) + { + // TODO: refactor this because it's weird + // this is a failure to authenticate but we return Ok. + info!("endpoint is not valid, skipping the request"); + return Ok(AuthInfo::default()); + } + let request_id = ctx.session_id().to_string(); + let application_name = ctx.console_application_name(); + async { + let request = self + .endpoint + .get_path("get_endpoint_access_control") + .header(X_REQUEST_ID, &request_id) + .header(AUTHORIZATION, format!("Bearer {}", &self.jwt)) + .query(&[("session_id", ctx.session_id())]) + .query(&[ + ("application_name", application_name.as_str()), + ("endpointish", user_info.endpoint.as_str()), + ("role", user_info.user.as_str()), + ]) + .build()?; + + debug!(url = request.url().as_str(), "sending http request"); + let start = Instant::now(); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); + let response = self.endpoint.execute(request).await?; + drop(pause); + info!(duration = ?start.elapsed(), "received http response"); + let body = match parse_body::(response).await { + Ok(body) => body, + // Error 404 is special: it's ok not to have a secret. + // TODO(anna): retry + Err(e) => { + return if e.get_reason().is_not_found() { + // TODO: refactor this because it's weird + // this is a failure to authenticate but we return Ok. 
+ Ok(AuthInfo::default()) + } else { + Err(e.into()) + }; + } + }; + + // Ivan: don't know where it will be used, so I leave it here + let _endpoint_vpc_ids = body.allowed_vpc_endpoint_ids.unwrap_or_default(); + + let secret = if body.role_secret.is_empty() { + None + } else { + let secret = scram::ServerSecret::parse(&body.role_secret) + .map(AuthSecret::Scram) + .ok_or(GetAuthInfoError::BadSecret)?; + Some(secret) + }; + let allowed_ips = body.allowed_ips.unwrap_or_default(); + Metrics::get() + .proxy + .allowed_ips_number + .observe(allowed_ips.len() as f64); + Ok(AuthInfo { + secret, + allowed_ips, + project_id: body.project_id, + }) + } + .inspect_err(|e| tracing::debug!(error = ?e)) + .instrument(info_span!("do_get_auth_info")) + .await + } + + async fn do_get_endpoint_jwks( + &self, + ctx: &RequestContext, + endpoint: EndpointId, + ) -> Result, GetEndpointJwksError> { + if !self + .caches + .endpoints_cache + .is_valid(ctx, &endpoint.normalize()) + { + return Err(GetEndpointJwksError::EndpointNotFound); + } + let request_id = ctx.session_id().to_string(); + async { + let request = self + .endpoint + .get_with_url(|url| { + url.path_segments_mut() + .push("endpoints") + .push(endpoint.as_str()) + .push("jwks"); + }) + .header(X_REQUEST_ID, &request_id) + .header(AUTHORIZATION, format!("Bearer {}", &self.jwt)) + .query(&[("session_id", ctx.session_id())]) + .build() + .map_err(GetEndpointJwksError::RequestBuild)?; + + debug!(url = request.url().as_str(), "sending http request"); + let start = Instant::now(); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); + let response = self + .endpoint + .execute(request) + .await + .map_err(GetEndpointJwksError::RequestExecute)?; + drop(pause); + info!(duration = ?start.elapsed(), "received http response"); + + let body = parse_body::(response).await?; + + let rules = body + .jwks + .into_iter() + .map(|jwks| AuthRule { + id: jwks.id, + jwks_url: jwks.jwks_url, + audience: jwks.jwt_audience, + role_names: jwks.role_names, + }) + .collect(); + + Ok(rules) + } + .inspect_err(|e| tracing::debug!(error = ?e)) + .instrument(info_span!("do_get_endpoint_jwks")) + .await + } + + async fn do_wake_compute( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + let request_id = ctx.session_id().to_string(); + let application_name = ctx.console_application_name(); + async { + let mut request_builder = self + .endpoint + .get_path("wake_compute") + .header("X-Request-ID", &request_id) + .header("Authorization", format!("Bearer {}", &self.jwt)) + .query(&[("session_id", ctx.session_id())]) + .query(&[ + ("application_name", application_name.as_str()), + ("endpointish", user_info.endpoint.as_str()), + ]); + + let options = user_info.options.to_deep_object(); + if !options.is_empty() { + request_builder = request_builder.query(&options); + } + + let request = request_builder.build()?; + + debug!(url = request.url().as_str(), "sending http request"); + let start = Instant::now(); + let pause = ctx.latency_timer_pause(crate::metrics::Waiting::Cplane); + let response = self.endpoint.execute(request).await?; + drop(pause); + info!(duration = ?start.elapsed(), "received http response"); + let body = parse_body::(response).await?; + + // Unfortunately, ownership won't let us use `Option::ok_or` here. + let (host, port) = match parse_host_port(&body.address) { + None => return Err(WakeComputeError::BadComputeAddress(body.address)), + Some(x) => x, + }; + + // Don't set anything but host and port! 
This config will be cached. + // We'll set username and such later using the startup message. + // TODO: add more type safety (in progress). + let mut config = compute::ConnCfg::new(host.to_owned(), port); + config.ssl_mode(SslMode::Disable); // TLS is not configured on compute nodes. + + let node = NodeInfo { + config, + aux: body.aux, + allow_self_signed_compute: false, + }; + + Ok(node) + } + .inspect_err(|e| tracing::debug!(error = ?e)) + .instrument(info_span!("do_wake_compute")) + .await + } +} + +impl super::ControlPlaneApi for NeonControlPlaneClient { + #[tracing::instrument(skip_all)] + async fn get_role_secret( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + let normalized_ep = &user_info.endpoint.normalize(); + let user = &user_info.user; + if let Some(role_secret) = self + .caches + .project_info + .get_role_secret(normalized_ep, user) + { + return Ok(role_secret); + } + let auth_info = self.do_get_auth_info(ctx, user_info).await?; + if let Some(project_id) = auth_info.project_id { + let normalized_ep_int = normalized_ep.into(); + self.caches.project_info.insert_role_secret( + project_id, + normalized_ep_int, + user.into(), + auth_info.secret.clone(), + ); + self.caches.project_info.insert_allowed_ips( + project_id, + normalized_ep_int, + Arc::new(auth_info.allowed_ips), + ); + ctx.set_project_id(project_id); + } + // When we just got a secret, we don't need to invalidate it. + Ok(Cached::new_uncached(auth_info.secret)) + } + + async fn get_allowed_ips_and_secret( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result<(CachedAllowedIps, Option), GetAuthInfoError> { + let normalized_ep = &user_info.endpoint.normalize(); + if let Some(allowed_ips) = self.caches.project_info.get_allowed_ips(normalized_ep) { + Metrics::get() + .proxy + .allowed_ips_cache_misses + .inc(CacheOutcome::Hit); + return Ok((allowed_ips, None)); + } + Metrics::get() + .proxy + .allowed_ips_cache_misses + .inc(CacheOutcome::Miss); + let auth_info = self.do_get_auth_info(ctx, user_info).await?; + let allowed_ips = Arc::new(auth_info.allowed_ips); + let user = &user_info.user; + if let Some(project_id) = auth_info.project_id { + let normalized_ep_int = normalized_ep.into(); + self.caches.project_info.insert_role_secret( + project_id, + normalized_ep_int, + user.into(), + auth_info.secret.clone(), + ); + self.caches.project_info.insert_allowed_ips( + project_id, + normalized_ep_int, + allowed_ips.clone(), + ); + ctx.set_project_id(project_id); + } + Ok(( + Cached::new_uncached(allowed_ips), + Some(Cached::new_uncached(auth_info.secret)), + )) + } + + #[tracing::instrument(skip_all)] + async fn get_endpoint_jwks( + &self, + ctx: &RequestContext, + endpoint: EndpointId, + ) -> Result, GetEndpointJwksError> { + self.do_get_endpoint_jwks(ctx, endpoint).await + } + + #[tracing::instrument(skip_all)] + async fn wake_compute( + &self, + ctx: &RequestContext, + user_info: &ComputeUserInfo, + ) -> Result { + let key = user_info.endpoint_cache_key(); + + macro_rules! 
check_cache { + () => { + if let Some(cached) = self.caches.node_info.get(&key) { + let (cached, info) = cached.take_value(); + let info = info.map_err(|c| { + info!(key = &*key, "found cached wake_compute error"); + WakeComputeError::ControlPlane(ControlPlaneError::Message(Box::new(*c))) + })?; + + debug!(key = &*key, "found cached compute node info"); + ctx.set_project(info.aux.clone()); + return Ok(cached.map(|()| info)); + } + }; + } + + // Every time we do a wakeup http request, the compute node will stay up + // for some time (highly depends on the console's scale-to-zero policy); + // The connection info remains the same during that period of time, + // which means that we might cache it to reduce the load and latency. + check_cache!(); + + let permit = self.locks.get_permit(&key).await?; + + // after getting back a permit - it's possible the cache was filled + // double check + if permit.should_check_cache() { + // TODO: if there is something in the cache, mark the permit as success. + check_cache!(); + } + + // check rate limit + if !self + .wake_compute_endpoint_rate_limiter + .check(user_info.endpoint.normalize_intern(), 1) + { + return Err(WakeComputeError::TooManyConnections); + } + + let node = permit.release_result(self.do_wake_compute(ctx, user_info).await); + match node { + Ok(node) => { + ctx.set_project(node.aux.clone()); + debug!(key = &*key, "created a cache entry for woken compute node"); + + let mut stored_node = node.clone(); + // store the cached node as 'warm_cached' + stored_node.aux.cold_start_info = ColdStartInfo::WarmCached; + + let (_, cached) = self.caches.node_info.insert_unit(key, Ok(stored_node)); + + Ok(cached.map(|()| node)) + } + Err(err) => match err { + WakeComputeError::ControlPlane(ControlPlaneError::Message(err)) => { + let Some(status) = &err.status else { + return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message( + err, + ))); + }; + + let reason = status + .details + .error_info + .map_or(Reason::Unknown, |x| x.reason); + + // if we can retry this error, do not cache it. + if reason.can_retry() { + return Err(WakeComputeError::ControlPlane(ControlPlaneError::Message( + err, + ))); + } + + // at this point, we should only have quota errors. + debug!( + key = &*key, + "created a cache entry for the wake compute error" + ); + + self.caches.node_info.insert_ttl( + key, + Err(err.clone()), + Duration::from_secs(30), + ); + + Err(WakeComputeError::ControlPlane(ControlPlaneError::Message( + err, + ))) + } + err => return Err(err), + }, + } + } +} + +/// Parse http response body, taking status code into account. +async fn parse_body serde::Deserialize<'a>>( + response: http::Response, +) -> Result { + let status = response.status(); + if status.is_success() { + // We shouldn't log raw body because it may contain secrets. + info!("request succeeded, processing the body"); + return Ok(response.json().await?); + } + let s = response.bytes().await?; + // Log plaintext to be able to detect, whether there are some cases not covered by the error struct. + info!("response_error plaintext: {:?}", s); + + // Don't throw an error here because it's not as important + // as the fact that the request itself has failed. 
+ let mut body = serde_json::from_slice(&s).unwrap_or_else(|e| { + warn!("failed to parse error body: {e}"); + ControlPlaneErrorMessage { + error: "reason unclear (malformed error message)".into(), + http_status_code: status, + status: None, + } + }); + body.http_status_code = status; + + warn!("console responded with an error ({status}): {body:?}"); + Err(ControlPlaneError::Message(Box::new(body))) +} + +fn parse_host_port(input: &str) -> Option<(&str, u16)> { + let (host, port) = input.rsplit_once(':')?; + let ipv6_brackets: &[_] = &['[', ']']; + Some((host.trim_matches(ipv6_brackets), port.parse().ok()?)) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_parse_host_port_v4() { + let (host, port) = parse_host_port("127.0.0.1:5432").expect("failed to parse"); + assert_eq!(host, "127.0.0.1"); + assert_eq!(port, 5432); + } + + #[test] + fn test_parse_host_port_v6() { + let (host, port) = parse_host_port("[2001:db8::1]:5432").expect("failed to parse"); + assert_eq!(host, "2001:db8::1"); + assert_eq!(port, 5432); + } + + #[test] + fn test_parse_host_port_url() { + let (host, port) = parse_host_port("compute-foo-bar-1234.default.svc.cluster.local:5432") + .expect("failed to parse"); + assert_eq!(host, "compute-foo-bar-1234.default.svc.cluster.local"); + assert_eq!(port, 5432); + } +} diff --git a/proxy/src/control_plane/client/mod.rs b/proxy/src/control_plane/client/mod.rs index f8f74372f0a2..7ef5a9c9fd68 100644 --- a/proxy/src/control_plane/client/mod.rs +++ b/proxy/src/control_plane/client/mod.rs @@ -1,3 +1,4 @@ +pub mod cplane_proxy_v1; #[cfg(any(test, feature = "testing"))] pub mod mock; pub mod neon; @@ -27,6 +28,8 @@ use crate::types::EndpointId; #[non_exhaustive] #[derive(Clone)] pub enum ControlPlaneClient { + /// New Proxy V1 control plane API + ProxyV1(cplane_proxy_v1::NeonControlPlaneClient), /// Current Management API (V2). Neon(neon::NeonControlPlaneClient), /// Local mock control plane. 
@@ -45,6 +48,7 @@ impl ControlPlaneApi for ControlPlaneClient { user_info: &ComputeUserInfo, ) -> Result { match self { + Self::ProxyV1(api) => api.get_role_secret(ctx, user_info).await, Self::Neon(api) => api.get_role_secret(ctx, user_info).await, #[cfg(any(test, feature = "testing"))] Self::PostgresMock(api) => api.get_role_secret(ctx, user_info).await, @@ -61,6 +65,7 @@ impl ControlPlaneApi for ControlPlaneClient { user_info: &ComputeUserInfo, ) -> Result<(CachedAllowedIps, Option), errors::GetAuthInfoError> { match self { + Self::ProxyV1(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, Self::Neon(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, #[cfg(any(test, feature = "testing"))] Self::PostgresMock(api) => api.get_allowed_ips_and_secret(ctx, user_info).await, @@ -75,6 +80,7 @@ impl ControlPlaneApi for ControlPlaneClient { endpoint: EndpointId, ) -> Result, errors::GetEndpointJwksError> { match self { + Self::ProxyV1(api) => api.get_endpoint_jwks(ctx, endpoint).await, Self::Neon(api) => api.get_endpoint_jwks(ctx, endpoint).await, #[cfg(any(test, feature = "testing"))] Self::PostgresMock(api) => api.get_endpoint_jwks(ctx, endpoint).await, @@ -89,6 +95,7 @@ impl ControlPlaneApi for ControlPlaneClient { user_info: &ComputeUserInfo, ) -> Result { match self { + Self::ProxyV1(api) => api.wake_compute(ctx, user_info).await, Self::Neon(api) => api.wake_compute(ctx, user_info).await, #[cfg(any(test, feature = "testing"))] Self::PostgresMock(api) => api.wake_compute(ctx, user_info).await, diff --git a/proxy/src/control_plane/client/neon.rs b/proxy/src/control_plane/client/neon.rs index 5c204ae1d700..bf62c0d6abd3 100644 --- a/proxy/src/control_plane/client/neon.rs +++ b/proxy/src/control_plane/client/neon.rs @@ -1,4 +1,4 @@ -//! Production console backend. +//! Stale console backend, remove after migrating to Proxy V1 API (#15245). use std::sync::Arc; use std::time::Duration; diff --git a/proxy/src/control_plane/messages.rs b/proxy/src/control_plane/messages.rs index 8762ba874bdf..2662ab85f96f 100644 --- a/proxy/src/control_plane/messages.rs +++ b/proxy/src/control_plane/messages.rs @@ -230,6 +230,16 @@ pub(crate) struct GetRoleSecret { pub(crate) project_id: Option, } +/// Response which holds client's auth secret, e.g. [`crate::scram::ServerSecret`]. +/// Returned by the `/get_endpoint_access_control` API method. +#[derive(Deserialize)] +pub(crate) struct GetEndpointAccessControl { + pub(crate) role_secret: Box, + pub(crate) allowed_ips: Option>, + pub(crate) project_id: Option, + pub(crate) allowed_vpc_endpoint_ids: Option>, +} + // Manually implement debug to omit sensitive info. impl fmt::Debug for GetRoleSecret { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { From db793044167d1b81cea7f2a2a57a189711d0d683 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Thu, 5 Dec 2024 18:29:21 +0100 Subject: [PATCH 45/65] storage_controller: increase shard scan timeout (#10000) ## Problem The node shard scan timeout of 1 second is a bit too aggressive, and we've seen this cause test failures. The scans are performed in parallel across nodes, and the entire operation has a 15 second timeout. Resolves #9801. ## Summary of changes Increase the timeout to 5 seconds. This is still enough to time out on a network failure and retry successfully within 15 seconds. 
--- storage_controller/src/service.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index 92ec58cb4d68..083c78233a8a 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -789,7 +789,7 @@ impl Service { node_list_futs.push({ async move { tracing::info!("Scanning shards on node {node}..."); - let timeout = Duration::from_secs(1); + let timeout = Duration::from_secs(5); let response = node .with_client_retries( |client| async move { client.list_location_config().await }, From 13e810574029953ab4f5002724ad853fc2c39922 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Thu, 5 Dec 2024 18:57:25 +0100 Subject: [PATCH 46/65] feat(compute): Allow specifying the reconfiguration concurrency (#10006) ## Problem We need a higher concurrency during reconfiguration in case of many DBs, but the instance is already running and used by the client. We can easily get out of `max_connections` limit, and the current code won't handle that. ## Summary of changes Default to 1, but also allow control plane to override this value for specific projects. It's also recommended to bump `superuser_reserved_connections` += `reconfigure_concurrency` for such projects to ensure that we always have enough spare connections for reconfiguration process to succeed. Quick workaround for neondatabase/cloud#17846 --- compute_tools/src/compute.rs | 7 +------ control_plane/src/endpoint.rs | 1 + libs/compute_api/src/spec.rs | 25 ++++++++++++++++++++++--- 3 files changed, 24 insertions(+), 9 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 0d1e6d680fe9..d72a04f2f979 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1243,12 +1243,7 @@ impl ComputeNode { let postgresql_conf_path = pgdata_path.join("postgresql.conf"); config::write_postgres_conf(&postgresql_conf_path, &spec, self.http_port)?; - // TODO(ololobus): We need a concurrency during reconfiguration as well, - // but DB is already running and used by user. We can easily get out of - // `max_connections` limit, and the current code won't handle that. 
- // let compute_state = self.state.lock().unwrap().clone(); - // let max_concurrent_connections = self.max_service_connections(&compute_state, &spec); - let max_concurrent_connections = 1; + let max_concurrent_connections = spec.reconfigure_concurrency; // Temporarily reset max_cluster_size in config // to avoid the possibility of hitting the limit, while we are reconfiguring: diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 1ca6dc43c4cf..360857f365ff 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -618,6 +618,7 @@ impl Endpoint { pgbouncer_settings: None, shard_stripe_size: Some(shard_stripe_size), local_proxy_config: None, + reconfigure_concurrency: 1, }; let spec_path = self.endpoint_path().join("spec.json"); std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?; diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs index 8a447563dcf2..6d9c353cda1b 100644 --- a/libs/compute_api/src/spec.rs +++ b/libs/compute_api/src/spec.rs @@ -19,6 +19,10 @@ pub type PgIdent = String; /// String type alias representing Postgres extension version pub type ExtVersion = String; +fn default_reconfigure_concurrency() -> usize { + 1 +} + /// Cluster spec or configuration represented as an optional number of /// delta operations + final cluster state description. #[derive(Clone, Debug, Default, Deserialize, Serialize)] @@ -67,7 +71,7 @@ pub struct ComputeSpec { pub cluster: Cluster, pub delta_operations: Option>, - /// An optinal hint that can be passed to speed up startup time if we know + /// An optional hint that can be passed to speed up startup time if we know /// that no pg catalog mutations (like role creation, database creation, /// extension creation) need to be done on the actual database to start. #[serde(default)] // Default false @@ -86,9 +90,7 @@ pub struct ComputeSpec { // etc. GUCs in cluster.settings. TODO: Once the control plane has been // updated to fill these fields, we can make these non optional. pub tenant_id: Option, - pub timeline_id: Option, - pub pageserver_connstring: Option, #[serde(default)] @@ -113,6 +115,20 @@ pub struct ComputeSpec { /// Local Proxy configuration used for JWT authentication #[serde(default)] pub local_proxy_config: Option, + + /// Number of concurrent connections during the parallel RunInEachDatabase + /// phase of the apply config process. + /// + /// We need a higher concurrency during reconfiguration in case of many DBs, + /// but instance is already running and used by client. We can easily get out of + /// `max_connections` limit, and the current code won't handle that. + /// + /// Default is 1, but also allow control plane to override this value for specific + /// projects. It's also recommended to bump `superuser_reserved_connections` += + /// `reconfigure_concurrency` for such projects to ensure that we always have + /// enough spare connections for reconfiguration process to succeed. + #[serde(default = "default_reconfigure_concurrency")] + pub reconfigure_concurrency: usize, } /// Feature flag to signal `compute_ctl` to enable certain experimental functionality. @@ -315,6 +331,9 @@ mod tests { // Features list defaults to empty vector. assert!(spec.features.is_empty()); + + // Reconfigure concurrency defaults to 1. 
+ assert_eq!(spec.reconfigure_concurrency, 1); } #[test] From c0ba4169676300c72ec3b567996c2604be93b136 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 5 Dec 2024 13:04:33 -0600 Subject: [PATCH 47/65] Add compute_logical_snapshots_bytes metric (#9887) This metric exposes the size of all non-temporary logical snapshot files. Signed-off-by: Tristan Partin --- compute/etc/neon_collector.jsonnet | 1 + .../compute_logical_snapshots_bytes.15.sql | 7 +++++++ .../compute_logical_snapshots_bytes.libsonnet | 17 +++++++++++++++++ .../compute_logical_snapshots_bytes.sql | 9 +++++++++ 4 files changed, 34 insertions(+) create mode 100644 compute/etc/sql_exporter/compute_logical_snapshots_bytes.15.sql create mode 100644 compute/etc/sql_exporter/compute_logical_snapshots_bytes.libsonnet create mode 100644 compute/etc/sql_exporter/compute_logical_snapshots_bytes.sql diff --git a/compute/etc/neon_collector.jsonnet b/compute/etc/neon_collector.jsonnet index 75d69c7b6888..aa6cc1cfc8a9 100644 --- a/compute/etc/neon_collector.jsonnet +++ b/compute/etc/neon_collector.jsonnet @@ -6,6 +6,7 @@ import 'sql_exporter/compute_backpressure_throttling_seconds.libsonnet', import 'sql_exporter/compute_current_lsn.libsonnet', import 'sql_exporter/compute_logical_snapshot_files.libsonnet', + import 'sql_exporter/compute_logical_snapshots_bytes.libsonnet', import 'sql_exporter/compute_max_connections.libsonnet', import 'sql_exporter/compute_receive_lsn.libsonnet', import 'sql_exporter/compute_subscriptions_count.libsonnet', diff --git a/compute/etc/sql_exporter/compute_logical_snapshots_bytes.15.sql b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.15.sql new file mode 100644 index 000000000000..73a9c114053d --- /dev/null +++ b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.15.sql @@ -0,0 +1,7 @@ +SELECT + (SELECT current_setting('neon.timeline_id')) AS timeline_id, + -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. + -- These temporary snapshot files are renamed to the actual snapshot files + -- after they are completely built. 
We only WAL-log the completely built + -- snapshot files + (SELECT COALESCE(sum(size), 0) FROM pg_ls_logicalsnapdir() WHERE name LIKE '%.snap') AS logical_snapshots_bytes; diff --git a/compute/etc/sql_exporter/compute_logical_snapshots_bytes.libsonnet b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.libsonnet new file mode 100644 index 000000000000..8e1792d386ed --- /dev/null +++ b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.libsonnet @@ -0,0 +1,17 @@ +local neon = import 'neon.libsonnet'; + +local pg_ls_logicalsnapdir = importstr 'sql_exporter/compute_logical_snapshots_bytes.15.sql'; +local pg_ls_dir = importstr 'sql_exporter/compute_logical_snapshots_bytes.sql'; + +{ + metric_name: 'compute_logical_snapshots_bytes', + type: 'gauge', + help: 'Size of the pg_logical/snapshots directory, not including temporary files', + key_labels: [ + 'timeline_id', + ], + values: [ + 'logical_snapshots_bytes', + ], + query: if neon.PG_MAJORVERSION_NUM < 15 then pg_ls_dir else pg_ls_logicalsnapdir, +} diff --git a/compute/etc/sql_exporter/compute_logical_snapshots_bytes.sql b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.sql new file mode 100644 index 000000000000..16da899de28c --- /dev/null +++ b/compute/etc/sql_exporter/compute_logical_snapshots_bytes.sql @@ -0,0 +1,9 @@ +SELECT + (SELECT setting FROM pg_settings WHERE name = 'neon.timeline_id') AS timeline_id, + -- Postgres creates temporary snapshot files of the form %X-%X.snap.%d.tmp. + -- These temporary snapshot files are renamed to the actual snapshot files + -- after they are completely built. We only WAL-log the completely built + -- snapshot files + (SELECT COALESCE(sum((pg_stat_file('pg_logical/snapshots/' || name, missing_ok => true)).size), 0) + FROM (SELECT * FROM pg_ls_dir('pg_logical/snapshots') WHERE pg_ls_dir LIKE '%.snap') AS name + ) AS logical_snapshots_bytes; From 71f38d135467ef8691f062c62fa5d8f3bf49ea6d Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Thu, 5 Dec 2024 11:37:17 -0800 Subject: [PATCH 48/65] feat(pageserver): support schedule gc-compaction (#9809) ## Problem part of https://github.com/neondatabase/neon/issues/9114 gc-compaction can take a long time. This patch adds support for scheduling a gc-compaction job. The compaction loop will first handle L0->L1 compaction, and then gc compaction. The scheduled jobs are stored in a non-persistent queue within the tenant structure. This will be the building block for the partial compaction trigger -- if the system determines that we need to do a gc compaction, it will partition the keyspace and schedule several jobs. Each of these jobs will run for a short amount of time (i.e, 1 min). L0 compaction will be prioritized over gc compaction. ## Summary of changes * Add compaction scheduler in tenant. * Run scheduled compaction in integration tests. * Change the manual compaction API to allow schedule a compaction instead of immediately doing it. * Add LSN upper bound as gc-compaction parameter. If we schedule partial compactions, gc_cutoff might move across different runs. Therefore, we need to pass a pre-determined gc_cutoff beforehand. (TODO: support LSN lower bound so that we can compact arbitrary "rectangle" in the layer map) * Refactor the gc_compaction internal interface. 
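For illustration only (not part of this patch's diff): a rough sketch of how a client could drive the scheduling API added here, over the pageserver HTTP endpoint touched below. The address, tenant/timeline ids and key range are placeholders; the `enhanced_gc_bottom_most_compaction` query parameter name is assumed from the test fixture kwarg; the JSON body fields mirror the new `CompactRequest` struct, and the DELETE call exercises the new cancel handler.

```python
# Sketch only: schedule a gc-compaction job instead of running it inline,
# optionally wait for it, then cancel any still-queued jobs for the timeline.
import requests

PAGESERVER = "http://localhost:9898"   # placeholder management API address
tenant_shard_id = "<tenant_shard_id>"  # placeholder
timeline_id = "<timeline_id>"          # placeholder

resp = requests.put(
    f"{PAGESERVER}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/compact",
    params={
        # assumed flag name, taken from the test fixture kwarg of the same name
        "enhanced_gc_bottom_most_compaction": "true",
        # new query parameter added in this patch
        "wait_until_scheduled_compaction_done": "true",
    },
    json={
        # fields of the new CompactRequest struct
        "scheduled": True,
        "compact_range": {
            "start": "000000000000000000000000000000000000",
            "end": "010000000000000000000000000000000000",
        },
        # "compact_below_lsn": "0/169AD58",  # optional LSN upper bound
    },
)
resp.raise_for_status()

# Cancel any remaining scheduled jobs via the new DELETE handler.
cancel = requests.delete(
    f"{PAGESERVER}/v1/tenant/{tenant_shard_id}/timeline/{timeline_id}/compact"
)
cancel.raise_for_status()
```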
--------- Signed-off-by: Alex Chi Z Co-authored-by: Christian Schwarz --- pageserver/src/http/routes.rs | 66 +++++-- pageserver/src/tenant.rs | 171 ++++++++++++++++--- pageserver/src/tenant/timeline.rs | 29 +++- pageserver/src/tenant/timeline/compaction.rs | 58 ++++--- test_runner/regress/test_compaction.py | 41 +++-- 5 files changed, 291 insertions(+), 74 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index e04f1460a8f2..b3981b4a8e7d 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -87,7 +87,7 @@ use crate::tenant::timeline::offload::offload_timeline; use crate::tenant::timeline::offload::OffloadError; use crate::tenant::timeline::CompactFlags; use crate::tenant::timeline::CompactOptions; -use crate::tenant::timeline::CompactRange; +use crate::tenant::timeline::CompactRequest; use crate::tenant::timeline::CompactionError; use crate::tenant::timeline::Timeline; use crate::tenant::GetTimelineError; @@ -1978,6 +1978,26 @@ async fn timeline_gc_handler( json_response(StatusCode::OK, gc_result) } +// Cancel scheduled compaction tasks +async fn timeline_cancel_compact_handler( + request: Request, + _cancel: CancellationToken, +) -> Result, ApiError> { + let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; + let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + check_permission(&request, Some(tenant_shard_id.tenant_id))?; + let state = get_state(&request); + async { + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + tenant.cancel_scheduled_compaction(timeline_id); + json_response(StatusCode::OK, ()) + } + .instrument(info_span!("timeline_cancel_compact", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id)) + .await +} + // Run compaction immediately on given timeline. async fn timeline_compact_handler( mut request: Request, @@ -1987,7 +2007,7 @@ async fn timeline_compact_handler( let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; - let compact_range = json_request_maybe::>(&mut request).await?; + let compact_request = json_request_maybe::>(&mut request).await?; let state = get_state(&request); @@ -2012,22 +2032,42 @@ async fn timeline_compact_handler( let wait_until_uploaded = parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false); + let wait_until_scheduled_compaction_done = + parse_query_param::<_, bool>(&request, "wait_until_scheduled_compaction_done")? 
+ .unwrap_or(false); + let options = CompactOptions { - compact_range, + compact_range: compact_request + .as_ref() + .and_then(|r| r.compact_range.clone()), + compact_below_lsn: compact_request.as_ref().and_then(|r| r.compact_below_lsn), flags, }; + let scheduled = compact_request.map(|r| r.scheduled).unwrap_or(false); + async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; - timeline - .compact_with_options(&cancel, options, &ctx) - .await - .map_err(|e| ApiError::InternalServerError(e.into()))?; - if wait_until_uploaded { - timeline.remote_client.wait_completion().await - // XXX map to correct ApiError for the cases where it's due to shutdown - .context("wait completion").map_err(ApiError::InternalServerError)?; + if scheduled { + let tenant = state + .tenant_manager + .get_attached_tenant_shard(tenant_shard_id)?; + let rx = tenant.schedule_compaction(timeline_id, options).await; + if wait_until_scheduled_compaction_done { + // It is possible that this will take a long time, dropping the HTTP request will not cancel the compaction. + rx.await.ok(); + } + } else { + timeline + .compact_with_options(&cancel, options, &ctx) + .await + .map_err(|e| ApiError::InternalServerError(e.into()))?; + if wait_until_uploaded { + timeline.remote_client.wait_completion().await + // XXX map to correct ApiError for the cases where it's due to shutdown + .context("wait completion").map_err(ApiError::InternalServerError)?; + } } json_response(StatusCode::OK, ()) } @@ -3301,6 +3341,10 @@ pub fn make_router( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact", |r| api_handler(r, timeline_compact_handler), ) + .delete( + "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact", + |r| api_handler(r, timeline_cancel_compact_handler), + ) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/offload", |r| testing_api_handler("attempt timeline offload", r, timeline_offload_handler), diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 5a9e398586f6..306ec9f5486e 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -37,14 +37,18 @@ use remote_timeline_client::manifest::{ }; use remote_timeline_client::UploadQueueNotReadyError; use std::collections::BTreeMap; +use std::collections::VecDeque; use std::fmt; use std::future::Future; use std::sync::atomic::AtomicBool; use std::sync::Weak; use std::time::SystemTime; use storage_broker::BrokerClientChannel; +use timeline::compaction::ScheduledCompactionTask; use timeline::import_pgdata; use timeline::offload::offload_timeline; +use timeline::CompactFlags; +use timeline::CompactOptions; use timeline::ShutdownMode; use tokio::io::BufReader; use tokio::sync::watch; @@ -339,6 +343,11 @@ pub struct Tenant { /// Overhead of mutex is acceptable because compaction is done with a multi-second period. compaction_circuit_breaker: std::sync::Mutex, + /// Scheduled compaction tasks. Currently, this can only be populated by triggering + /// a manual gc-compaction from the manual compaction API. + scheduled_compaction_tasks: + std::sync::Mutex>>, + /// If the tenant is in Activating state, notify this to encourage it /// to proceed to Active as soon as possible, rather than waiting for lazy /// background warmup. 
@@ -2953,27 +2962,68 @@ impl Tenant { for (timeline_id, timeline, (can_compact, can_offload)) in &timelines_to_compact_or_offload { + // pending_task_left == None: cannot compact, maybe still pending tasks + // pending_task_left == Some(true): compaction task left + // pending_task_left == Some(false): no compaction task left let pending_task_left = if *can_compact { - Some( - timeline - .compact(cancel, EnumSet::empty(), ctx) - .instrument(info_span!("compact_timeline", %timeline_id)) - .await - .inspect_err(|e| match e { - timeline::CompactionError::ShuttingDown => (), - timeline::CompactionError::Offload(_) => { - // Failures to offload timelines do not trip the circuit breaker, because - // they do not do lots of writes the way compaction itself does: it is cheap - // to retry, and it would be bad to stop all compaction because of an issue with offloading. - } - timeline::CompactionError::Other(e) => { - self.compaction_circuit_breaker - .lock() - .unwrap() - .fail(&CIRCUIT_BREAKERS_BROKEN, e); + let has_pending_l0_compaction_task = timeline + .compact(cancel, EnumSet::empty(), ctx) + .instrument(info_span!("compact_timeline", %timeline_id)) + .await + .inspect_err(|e| match e { + timeline::CompactionError::ShuttingDown => (), + timeline::CompactionError::Offload(_) => { + // Failures to offload timelines do not trip the circuit breaker, because + // they do not do lots of writes the way compaction itself does: it is cheap + // to retry, and it would be bad to stop all compaction because of an issue with offloading. + } + timeline::CompactionError::Other(e) => { + self.compaction_circuit_breaker + .lock() + .unwrap() + .fail(&CIRCUIT_BREAKERS_BROKEN, e); + } + })?; + if has_pending_l0_compaction_task { + Some(true) + } else { + let has_pending_scheduled_compaction_task; + let next_scheduled_compaction_task = { + let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); + if let Some(tline_pending_tasks) = guard.get_mut(timeline_id) { + let next_task = tline_pending_tasks.pop_front(); + has_pending_scheduled_compaction_task = !tline_pending_tasks.is_empty(); + next_task + } else { + has_pending_scheduled_compaction_task = false; + None + } + }; + if let Some(mut next_scheduled_compaction_task) = next_scheduled_compaction_task + { + if !next_scheduled_compaction_task + .options + .flags + .contains(CompactFlags::EnhancedGcBottomMostCompaction) + { + warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", next_scheduled_compaction_task.options); + } else { + let _ = timeline + .compact_with_options( + cancel, + next_scheduled_compaction_task.options, + ctx, + ) + .instrument(info_span!("scheduled_compact_timeline", %timeline_id)) + .await?; + if let Some(tx) = next_scheduled_compaction_task.result_tx.take() { + // TODO: we can send compaction statistics in the future + tx.send(()).ok(); } - })?, - ) + } + } + Some(has_pending_scheduled_compaction_task) + } } else { None }; @@ -2993,6 +3043,36 @@ impl Tenant { Ok(has_pending_task) } + /// Cancel scheduled compaction tasks + pub(crate) fn cancel_scheduled_compaction( + &self, + timeline_id: TimelineId, + ) -> Vec { + let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); + if let Some(tline_pending_tasks) = guard.get_mut(&timeline_id) { + let current_tline_pending_tasks = std::mem::take(tline_pending_tasks); + current_tline_pending_tasks.into_iter().collect() + } else { + Vec::new() + } + } + + /// Schedule a compaction task for a timeline. 
+ pub(crate) async fn schedule_compaction( + &self, + timeline_id: TimelineId, + options: CompactOptions, + ) -> tokio::sync::oneshot::Receiver<()> { + let (tx, rx) = tokio::sync::oneshot::channel(); + let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); + let tline_pending_tasks = guard.entry(timeline_id).or_default(); + tline_pending_tasks.push_back(ScheduledCompactionTask { + options, + result_tx: Some(tx), + }); + rx + } + // Call through to all timelines to freeze ephemeral layers if needed. Usually // this happens during ingest: this background housekeeping is for freezing layers // that are open but haven't been written to for some time. @@ -4005,6 +4085,7 @@ impl Tenant { // use an extremely long backoff. Some(Duration::from_secs(3600 * 24)), )), + scheduled_compaction_tasks: Mutex::new(Default::default()), activate_now_sem: tokio::sync::Semaphore::new(0), attach_wal_lag_cooldown: Arc::new(std::sync::OnceLock::new()), cancel: CancellationToken::default(), @@ -9163,6 +9244,7 @@ mod tests { CompactOptions { flags: dryrun_flags, compact_range: None, + compact_below_lsn: None, }, &ctx, ) @@ -9399,6 +9481,7 @@ mod tests { CompactOptions { flags: dryrun_flags, compact_range: None, + compact_below_lsn: None, }, &ctx, ) @@ -9885,7 +9968,15 @@ mod tests { // Do a partial compaction on key range 0..2 tline - .partial_compact_with_gc(get_key(0)..get_key(2), &cancel, EnumSet::new(), &ctx) + .compact_with_gc( + &cancel, + CompactOptions { + flags: EnumSet::new(), + compact_range: Some((get_key(0)..get_key(2)).into()), + compact_below_lsn: None, + }, + &ctx, + ) .await .unwrap(); let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; @@ -9924,7 +10015,15 @@ mod tests { // Do a partial compaction on key range 2..4 tline - .partial_compact_with_gc(get_key(2)..get_key(4), &cancel, EnumSet::new(), &ctx) + .compact_with_gc( + &cancel, + CompactOptions { + flags: EnumSet::new(), + compact_range: Some((get_key(2)..get_key(4)).into()), + compact_below_lsn: None, + }, + &ctx, + ) .await .unwrap(); let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; @@ -9968,7 +10067,15 @@ mod tests { // Do a partial compaction on key range 4..9 tline - .partial_compact_with_gc(get_key(4)..get_key(9), &cancel, EnumSet::new(), &ctx) + .compact_with_gc( + &cancel, + CompactOptions { + flags: EnumSet::new(), + compact_range: Some((get_key(4)..get_key(9)).into()), + compact_below_lsn: None, + }, + &ctx, + ) .await .unwrap(); let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; @@ -10011,7 +10118,15 @@ mod tests { // Do a partial compaction on key range 9..10 tline - .partial_compact_with_gc(get_key(9)..get_key(10), &cancel, EnumSet::new(), &ctx) + .compact_with_gc( + &cancel, + CompactOptions { + flags: EnumSet::new(), + compact_range: Some((get_key(9)..get_key(10)).into()), + compact_below_lsn: None, + }, + &ctx, + ) .await .unwrap(); let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; @@ -10059,7 +10174,15 @@ mod tests { // Do a partial compaction on key range 0..10, all image layers below LSN 20 can be replaced with new ones. 
tline - .partial_compact_with_gc(get_key(0)..get_key(10), &cancel, EnumSet::new(), &ctx) + .compact_with_gc( + &cancel, + CompactOptions { + flags: EnumSet::new(), + compact_range: Some((get_key(0)..get_key(10)).into()), + compact_below_lsn: None, + }, + &ctx, + ) .await .unwrap(); let all_layers = inspect_and_sort(&tline, Some(get_key(0)..get_key(10))).await; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index fc741826ab1e..fc69525bf4f7 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -768,7 +768,7 @@ pub enum GetLogicalSizePriority { Background, } -#[derive(enumset::EnumSetType)] +#[derive(Debug, enumset::EnumSetType)] pub(crate) enum CompactFlags { ForceRepartition, ForceImageLayerCreation, @@ -777,6 +777,16 @@ pub(crate) enum CompactFlags { DryRun, } +#[serde_with::serde_as] +#[derive(Debug, Clone, serde::Deserialize)] +pub(crate) struct CompactRequest { + pub compact_range: Option, + pub compact_below_lsn: Option, + /// Whether the compaction job should be scheduled. + #[serde(default)] + pub scheduled: bool, +} + #[serde_with::serde_as] #[derive(Debug, Clone, serde::Deserialize)] pub(crate) struct CompactRange { @@ -786,10 +796,24 @@ pub(crate) struct CompactRange { pub end: Key, } -#[derive(Clone, Default)] +impl From> for CompactRange { + fn from(range: Range) -> Self { + CompactRange { + start: range.start, + end: range.end, + } + } +} + +#[derive(Debug, Clone, Default)] pub(crate) struct CompactOptions { pub flags: EnumSet, + /// If set, the compaction will only compact the key range specified by this option. + /// This option is only used by GC compaction. pub compact_range: Option, + /// If set, the compaction will only compact the LSN below this value. + /// This option is only used by GC compaction. + pub compact_below_lsn: Option, } impl std::fmt::Debug for Timeline { @@ -1604,6 +1628,7 @@ impl Timeline { CompactOptions { flags, compact_range: None, + compact_below_lsn: None, }, ctx, ) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index ecd68ba55ec4..8ececa2bfb46 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -16,7 +16,6 @@ use super::{ use anyhow::{anyhow, bail, Context}; use bytes::Bytes; -use enumset::EnumSet; use fail::fail_point; use itertools::Itertools; use pageserver_api::key::KEY_SIZE; @@ -64,6 +63,12 @@ use super::CompactionError; /// Maximum number of deltas before generating an image layer in bottom-most compaction. const COMPACTION_DELTA_THRESHOLD: usize = 5; +/// A scheduled compaction task. +pub struct ScheduledCompactionTask { + pub options: CompactOptions, + pub result_tx: Option>, +} + pub struct GcCompactionJobDescription { /// All layers to read in the compaction job selected_layers: Vec, @@ -1746,24 +1751,6 @@ impl Timeline { Ok(()) } - pub(crate) async fn compact_with_gc( - self: &Arc, - cancel: &CancellationToken, - options: CompactOptions, - ctx: &RequestContext, - ) -> anyhow::Result<()> { - self.partial_compact_with_gc( - options - .compact_range - .map(|range| range.start..range.end) - .unwrap_or_else(|| Key::MIN..Key::MAX), - cancel, - options.flags, - ctx, - ) - .await - } - /// An experimental compaction building block that combines compaction with garbage collection. 
/// /// The current implementation picks all delta + image layers that are below or intersecting with @@ -1771,17 +1758,19 @@ impl Timeline { /// layers and image layers, which generates image layers on the gc horizon, drop deltas below gc horizon, /// and create delta layers with all deltas >= gc horizon. /// - /// If `key_range` is provided, it will only compact the keys within the range, aka partial compaction. + /// If `options.compact_range` is provided, it will only compact the keys within the range, aka partial compaction. /// Partial compaction will read and process all layers overlapping with the key range, even if it might /// contain extra keys. After the gc-compaction phase completes, delta layers that are not fully contained /// within the key range will be rewritten to ensure they do not overlap with the delta layers. Providing /// Key::MIN..Key..MAX to the function indicates a full compaction, though technically, `Key::MAX` is not /// part of the range. - pub(crate) async fn partial_compact_with_gc( + /// + /// If `options.compact_below_lsn` is provided, the compaction will only compact layers below or intersect with + /// the LSN. Otherwise, it will use the gc cutoff by default. + pub(crate) async fn compact_with_gc( self: &Arc, - compaction_key_range: Range, cancel: &CancellationToken, - flags: EnumSet, + options: CompactOptions, ctx: &RequestContext, ) -> anyhow::Result<()> { // Block other compaction/GC tasks from running for now. GC-compaction could run along @@ -1803,6 +1792,12 @@ impl Timeline { ) .await?; + let flags = options.flags; + let compaction_key_range = options + .compact_range + .map(|range| range.start..range.end) + .unwrap_or_else(|| Key::MIN..Key::MAX); + let dry_run = flags.contains(CompactFlags::DryRun); if compaction_key_range == (Key::MIN..Key::MAX) { @@ -1826,7 +1821,18 @@ impl Timeline { let layers = guard.layer_map()?; let gc_info = self.gc_info.read().unwrap(); let mut retain_lsns_below_horizon = Vec::new(); - let gc_cutoff = gc_info.cutoffs.select_min(); + let gc_cutoff = { + let real_gc_cutoff = gc_info.cutoffs.select_min(); + // The compaction algorithm will keep all keys above the gc_cutoff while keeping only necessary keys below the gc_cutoff for + // each of the retain_lsn. Therefore, if the user-provided `compact_below_lsn` is larger than the real gc cutoff, we will use + // the real cutoff. + let mut gc_cutoff = options.compact_below_lsn.unwrap_or(real_gc_cutoff); + if gc_cutoff > real_gc_cutoff { + warn!("provided compact_below_lsn={} is larger than the real_gc_cutoff={}, using the real gc cutoff", gc_cutoff, real_gc_cutoff); + gc_cutoff = real_gc_cutoff; + } + gc_cutoff + }; for (lsn, _timeline_id, _is_offloaded) in &gc_info.retain_lsns { if lsn < &gc_cutoff { retain_lsns_below_horizon.push(*lsn); @@ -1846,7 +1852,7 @@ impl Timeline { .map(|desc| desc.get_lsn_range().end) .max() else { - info!("no layers to compact with gc"); + info!("no layers to compact with gc: no historic layers below gc_cutoff, gc_cutoff={}", gc_cutoff); return Ok(()); }; // Then, pick all the layers that are below the max_layer_lsn. 
This is to ensure we can pick all single-key @@ -1869,7 +1875,7 @@ impl Timeline { } } if selected_layers.is_empty() { - info!("no layers to compact with gc"); + info!("no layers to compact with gc: no layers within the key range, gc_cutoff={}, key_range={}..{}", gc_cutoff, compaction_key_range.start, compaction_key_range.end); return Ok(()); } retain_lsns_below_horizon.sort(); diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index b6741aed68cb..de6653eb3f4e 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -15,7 +15,7 @@ from fixtures.utils import skip_in_debug_build, wait_until from fixtures.workload import Workload -AGGRESIVE_COMPACTION_TENANT_CONF = { +AGGRESSIVE_COMPACTION_TENANT_CONF = { # Disable gc and compaction. The test runs compaction manually. "gc_period": "0s", "compaction_period": "0s", @@ -24,6 +24,7 @@ # Compact small layers "compaction_target_size": 1024**2, "image_creation_threshold": 2, + # "lsn_lease_length": "0s", -- TODO: would cause branch creation errors, should fix later } @@ -51,7 +52,7 @@ def test_pageserver_compaction_smoke( page_cache_size=10 """ - env = neon_env_builder.init_start(initial_tenant_conf=AGGRESIVE_COMPACTION_TENANT_CONF) + env = neon_env_builder.init_start(initial_tenant_conf=AGGRESSIVE_COMPACTION_TENANT_CONF) tenant_id = env.initial_tenant timeline_id = env.initial_timeline @@ -120,14 +121,25 @@ def test_pageserver_compaction_smoke( assert vectored_average < 8 +@skip_in_debug_build("only run with release build") def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder): - env = neon_env_builder.init_start(initial_tenant_conf=AGGRESIVE_COMPACTION_TENANT_CONF) + SMOKE_CONF = { + # Run both gc and gc-compaction. + "gc_period": "5s", + "compaction_period": "5s", + # No PiTR interval and small GC horizon + "pitr_interval": "0s", + "gc_horizon": f"{1024 ** 2}", + "lsn_lease_length": "0s", + } + + env = neon_env_builder.init_start(initial_tenant_conf=SMOKE_CONF) tenant_id = env.initial_tenant timeline_id = env.initial_timeline - row_count = 1000 - churn_rounds = 10 + row_count = 10000 + churn_rounds = 50 ps_http = env.pageserver.http_client() @@ -141,20 +153,27 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder): if i % 10 == 0: log.info(f"Running churn round {i}/{churn_rounds} ...") - workload.churn_rows(row_count, env.pageserver.id) - # Force L0 compaction to ensure the number of layers is within bounds, so that gc-compaction can run. 
- ps_http.timeline_compact(tenant_id, timeline_id, force_l0_compaction=True) - assert ps_http.perf_info(tenant_id, timeline_id)[0]["num_of_l0"] <= 1 ps_http.timeline_compact( tenant_id, timeline_id, enhanced_gc_bottom_most_compaction=True, body={ - "start": "000000000000000000000000000000000000", - "end": "030000000000000000000000000000000000", + "scheduled": True, + "compact_range": { + "start": "000000000000000000000000000000000000", + # skip the SLRU range for now -- it races with get-lsn-by-timestamp, TODO: fix this + "end": "010000000000000000000000000000000000", + }, }, ) + workload.churn_rows(row_count, env.pageserver.id) + + # ensure gc_compaction is scheduled and it's actually running (instead of skipping due to no layers picked) + env.pageserver.assert_log_contains( + "scheduled_compact_timeline.*picked .* layers for compaction" + ) + log.info("Validating at workload end ...") workload.validate(env.pageserver.id) From 6331cb216195658b7926cadb8045759aa71c4575 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 5 Dec 2024 13:42:52 -0600 Subject: [PATCH 49/65] Bump anyhow to 1.0.94 (#10028) We were over a year out of date. Signed-off-by: Tristan Partin --- Cargo.lock | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 62f06d45bd52..f6e0024d874c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -133,9 +133,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.71" +version = "1.0.94" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9c7d0618f0e0b7e8ff11427422b64564d5fb0be1940354bfe2e0529b18a9d9b8" +checksum = "c1fd03a028ef38ba2276dce7e33fcd6369c158a1bca17946c4b1b701891c1ff7" dependencies = [ "backtrace", ] From 6ff4175fd7e62577ad0a7d1bba4fc3b6237ac764 Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Thu, 5 Dec 2024 14:30:35 -0600 Subject: [PATCH 50/65] Send Content-Type header on reconfigure request from neon_local (#10029) Signed-off-by: Tristan Partin --- control_plane/src/endpoint.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 360857f365ff..35067c95b6d0 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -53,6 +53,7 @@ use compute_api::spec::Role; use nix::sys::signal::kill; use nix::sys::signal::Signal; use pageserver_api::shard::ShardStripeSize; +use reqwest::header::CONTENT_TYPE; use serde::{Deserialize, Serialize}; use url::Host; use utils::id::{NodeId, TenantId, TimelineId}; @@ -818,6 +819,7 @@ impl Endpoint { self.http_address.ip(), self.http_address.port() )) + .header(CONTENT_TYPE.as_str(), "application/json") .body(format!( "{{\"spec\":{}}}", serde_json::to_string_pretty(&spec)? From d1ab7471e2d6603a5680ba33f749adb743c2154b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Thu, 5 Dec 2024 21:51:57 +0100 Subject: [PATCH 51/65] Fix desc_str for Azure container (#10021) Small logs fix I've noticed while working on https://github.com/neondatabase/cloud/issues/19963 . 
--- storage_scrubber/src/lib.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/storage_scrubber/src/lib.rs b/storage_scrubber/src/lib.rs index 1fe4fc58cd0e..be526daaf0d1 100644 --- a/storage_scrubber/src/lib.rs +++ b/storage_scrubber/src/lib.rs @@ -268,7 +268,7 @@ impl BucketConfig { config.bucket_name, config.bucket_region ), RemoteStorageKind::AzureContainer(config) => format!( - "bucket {}, storage account {:?}, region {}", + "container {}, storage account {:?}, region {}", config.container_name, config.storage_account, config.container_region ), } From 56f867bde5324b0d3333faaf7360aa07245f68c0 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 6 Dec 2024 08:22:22 +0100 Subject: [PATCH 52/65] pageserver: only zero truncated FSM page on owning shard (#10032) ## Problem FSM pages are managed like regular relation pages, and owned by a single shard. However, when truncating the FSM relation the last FSM page was zeroed out on all shards. This is unnecessary and potentially confusing. The superfluous keys will be removed during compactions, as they do not belong on these shards. Resolves #10027. ## Summary of changes Only zero out the truncated FSM page on the owning shard. --- pageserver/src/walingest.rs | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs index 93ae88936f60..30c8965d517d 100644 --- a/pageserver/src/walingest.rs +++ b/pageserver/src/walingest.rs @@ -582,18 +582,21 @@ impl WalIngest { forknum: FSM_FORKNUM, }; + // Zero out the last remaining FSM page, if this shard owns it. We are not precise here, + // and instead of digging in the FSM bitmap format we just clear the whole page. let fsm_logical_page_no = blkno / pg_constants::SLOTS_PER_FSM_PAGE; let mut fsm_physical_page_no = fsm_logical_to_physical(fsm_logical_page_no); - if blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0 { - // Tail of last remaining FSM page has to be zeroed. - // We are not precise here and instead of digging in FSM bitmap format just clear the whole page. + if blkno % pg_constants::SLOTS_PER_FSM_PAGE != 0 + && self + .shard + .is_key_local(&rel_block_to_key(rel, fsm_physical_page_no)) + { modification.put_rel_page_image_zero(rel, fsm_physical_page_no)?; fsm_physical_page_no += 1; } - // TODO: re-examine the None case here wrt. sharding; should we error? + // Truncate this shard's view of the FSM relation size, if it even has one. let nblocks = get_relsize(modification, rel, ctx).await?.unwrap_or(0); if nblocks > fsm_physical_page_no { - // check if something to do: FSM is larger than truncate position self.put_rel_truncation(modification, rel, fsm_physical_page_no, ctx) .await?; } @@ -617,7 +620,7 @@ impl WalIngest { // tail bits in the last remaining map page, representing truncated heap // blocks, need to be cleared. This is not only tidy, but also necessary // because we don't get a chance to clear the bits if the heap is extended - // again. + // again. Only do this on the shard that owns the page. if (trunc_byte != 0 || trunc_offs != 0) && self.shard.is_key_local(&rel_block_to_key(rel, vm_page_no)) { @@ -631,10 +634,9 @@ impl WalIngest { )?; vm_page_no += 1; } - // TODO: re-examine the None case here wrt. sharding; should we error? + // Truncate this shard's view of the VM relation size, if it even has one. 
let nblocks = get_relsize(modification, rel, ctx).await?.unwrap_or(0); if nblocks > vm_page_no { - // check if something to do: VM is larger than truncate position self.put_rel_truncation(modification, rel, vm_page_no, ctx) .await?; } From ec4072f84577eeb2a92d97fa77281efe50325730 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 6 Dec 2024 11:12:39 +0100 Subject: [PATCH 53/65] pageserver: add `wait_until_flushed` parameter for timeline checkpoint (#10013) ## Problem I'm writing an ingest benchmark in #9812. To time S3 uploads, I need to schedule a flush of the Pageserver's in-memory layer, but don't actually want to wait around for it to complete (which will take a minute). ## Summary of changes Add a parameter `wait_until_flush` (default `true`) for `timeline/checkpoint` to control whether to wait for the flush to complete. --- pageserver/src/http/routes.rs | 12 ++++++++---- pageserver/src/tenant/timeline.rs | 26 ++++++++++++++++--------- test_runner/fixtures/pageserver/http.py | 5 ++++- 3 files changed, 29 insertions(+), 14 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index b3981b4a8e7d..b7fddb065c61 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2148,16 +2148,20 @@ async fn timeline_checkpoint_handler( // By default, checkpoints come with a compaction, but this may be optionally disabled by tests that just want to flush + upload. let compact = parse_query_param::<_, bool>(&request, "compact")?.unwrap_or(true); + let wait_until_flushed: bool = + parse_query_param(&request, "wait_until_flushed")?.unwrap_or(true); + let wait_until_uploaded = parse_query_param::<_, bool>(&request, "wait_until_uploaded")?.unwrap_or(false); async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); let timeline = active_timeline_of_active_tenant(&state.tenant_manager, tenant_shard_id, timeline_id).await?; - timeline - .freeze_and_flush() - .await - .map_err(|e| { + if wait_until_flushed { + timeline.freeze_and_flush().await + } else { + timeline.freeze().await.and(Ok(())) + }.map_err(|e| { match e { tenant::timeline::FlushLayerError::Cancelled => ApiError::ShuttingDown, other => ApiError::InternalServerError(other.into()), diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index fc69525bf4f7..aab6703a3c2b 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1457,23 +1457,31 @@ impl Timeline { Ok(lease) } - /// Flush to disk all data that was written with the put_* functions + /// Freeze the current open in-memory layer. It will be written to disk on next iteration. + /// Returns the flush request ID which can be awaited with wait_flush_completion(). + #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))] + pub(crate) async fn freeze(&self) -> Result { + self.freeze0().await + } + + /// Freeze and flush the open in-memory layer, waiting for it to be written to disk. #[instrument(skip(self), fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%self.timeline_id))] pub(crate) async fn freeze_and_flush(&self) -> Result<(), FlushLayerError> { self.freeze_and_flush0().await } + /// Freeze the current open in-memory layer. It will be written to disk on next iteration. + /// Returns the flush request ID which can be awaited with wait_flush_completion(). 
+ pub(crate) async fn freeze0(&self) -> Result { + let mut g = self.write_lock.lock().await; + let to_lsn = self.get_last_record_lsn(); + self.freeze_inmem_layer_at(to_lsn, &mut g).await + } + // This exists to provide a non-span creating version of `freeze_and_flush` we can call without // polluting the span hierarchy. pub(crate) async fn freeze_and_flush0(&self) -> Result<(), FlushLayerError> { - let token = { - // Freeze the current open in-memory layer. It will be written to disk on next - // iteration. - let mut g = self.write_lock.lock().await; - - let to_lsn = self.get_last_record_lsn(); - self.freeze_inmem_layer_at(to_lsn, &mut g).await? - }; + let token = self.freeze0().await?; self.wait_flush_completion(token).await } diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py index 4cf3ece39634..0832eac22f2d 100644 --- a/test_runner/fixtures/pageserver/http.py +++ b/test_runner/fixtures/pageserver/http.py @@ -850,6 +850,7 @@ def timeline_checkpoint( force_repartition=False, force_image_layer_creation=False, force_l0_compaction=False, + wait_until_flushed=True, wait_until_uploaded=False, compact: bool | None = None, **kwargs, @@ -862,6 +863,8 @@ def timeline_checkpoint( query["force_image_layer_creation"] = "true" if force_l0_compaction: query["force_l0_compaction"] = "true" + if not wait_until_flushed: + query["wait_until_flushed"] = "false" if wait_until_uploaded: query["wait_until_uploaded"] = "true" @@ -869,7 +872,7 @@ def timeline_checkpoint( query["compact"] = "true" if compact else "false" log.info( - f"Requesting checkpoint: tenant {tenant_id}, timeline {timeline_id}, wait_until_uploaded={wait_until_uploaded}" + f"Requesting checkpoint: tenant={tenant_id} timeline={timeline_id} wait_until_flushed={wait_until_flushed} wait_until_uploaded={wait_until_uploaded} compact={compact}" ) res = self.put( f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}/checkpoint", From 3f1c5429577ca1dee8c5e0955e4072cee2a13eca Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 6 Dec 2024 10:21:52 +0000 Subject: [PATCH 54/65] pageserver: add disk consistent and remote lsn metrics (#10005) ## Problem There's no metrics for disk consistent LSN and remote LSN. This stuff is useful when looking at ingest performance. ## Summary of changes Two per timeline metrics are added: `pageserver_disk_consistent_lsn` and `pageserver_projected_remote_consistent_lsn`. I went for the projected remote lsn instead of the visible one because that more closely matches remote storage write tput. Ideally we would have both, but these metrics are expensive. 
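As a rough illustration of how the two new gauges can be read together (this sketch is not part of the change), the snippet below scrapes a pageserver's Prometheus text endpoint and derives per-timeline ingest and upload backlogs. The listen address, the timeline id placeholder, and the ad-hoc text-format parsing are assumptions for local experimentation only; in practice the same deltas would normally be computed in the metrics stack.

```python
# Hypothetical helper, not part of this change: derive ingest/upload backlog for one
# timeline from the Prometheus text exposition of a locally running pageserver.
import re
import urllib.request

METRICS_URL = "http://127.0.0.1:9898/metrics"  # assumed local pageserver HTTP listen address
TIMELINE_ID = "<timeline id>"  # placeholder: fill in a real timeline id


def read_gauge(text: str, name: str) -> float | None:
    # Prometheus text format: metric_name{label="...",...} value
    pattern = rf'{name}\{{[^}}]*timeline_id="{TIMELINE_ID}"[^}}]*\}}\s+([0-9.eE+-]+)'
    for line in text.splitlines():
        match = re.match(pattern, line)
        if match:
            return float(match.group(1))
    return None


body = urllib.request.urlopen(METRICS_URL).read().decode()
last_record = read_gauge(body, "pageserver_last_record_lsn")
disk_consistent = read_gauge(body, "pageserver_disk_consistent_lsn")
remote_projected = read_gauge(body, "pageserver_projected_remote_consistent_lsn")

if None not in (last_record, disk_consistent, remote_projected):
    # LSNs are byte positions in the WAL, so the deltas are bytes of backlog.
    print(f"unflushed (in-memory) backlog: {last_record - disk_consistent:.0f} bytes")
    print(f"unuploaded backlog:            {disk_consistent - remote_projected:.0f} bytes")
```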
--- pageserver/src/metrics.rs | 46 +++++++++++++++++-- .../src/tenant/remote_timeline_client.rs | 3 ++ pageserver/src/tenant/timeline.rs | 8 +++- test_runner/fixtures/metrics.py | 2 + 4 files changed, 54 insertions(+), 5 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 998c15ccaf2c..e3b6f43bc4db 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -464,6 +464,24 @@ static LAST_RECORD_LSN: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); +static DISK_CONSISTENT_LSN: Lazy = Lazy::new(|| { + register_int_gauge_vec!( + "pageserver_disk_consistent_lsn", + "Disk consistent LSN grouped by timeline", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + +pub(crate) static PROJECTED_REMOTE_CONSISTENT_LSN: Lazy = Lazy::new(|| { + register_uint_gauge_vec!( + "pageserver_projected_remote_consistent_lsn", + "Projected remote consistent LSN grouped by timeline", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + static PITR_HISTORY_SIZE: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_pitr_history_size", @@ -2394,7 +2412,8 @@ pub(crate) struct TimelineMetrics { pub load_layer_map_histo: StorageTimeMetrics, pub garbage_collect_histo: StorageTimeMetrics, pub find_gc_cutoffs_histo: StorageTimeMetrics, - pub last_record_gauge: IntGauge, + pub last_record_lsn_gauge: IntGauge, + pub disk_consistent_lsn_gauge: IntGauge, pub pitr_history_size: UIntGauge, pub archival_size: UIntGauge, pub(crate) layer_size_image: UIntGauge, @@ -2475,7 +2494,11 @@ impl TimelineMetrics { &shard_id, &timeline_id, ); - let last_record_gauge = LAST_RECORD_LSN + let last_record_lsn_gauge = LAST_RECORD_LSN + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); + + let disk_consistent_lsn_gauge = DISK_CONSISTENT_LSN .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); @@ -2578,7 +2601,8 @@ impl TimelineMetrics { garbage_collect_histo, find_gc_cutoffs_histo, load_layer_map_histo, - last_record_gauge, + last_record_lsn_gauge, + disk_consistent_lsn_gauge, pitr_history_size, archival_size, layer_size_image, @@ -2642,6 +2666,7 @@ impl TimelineMetrics { let timeline_id = &self.timeline_id; let shard_id = &self.shard_id; let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]); + let _ = DISK_CONSISTENT_LSN.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = FLUSH_WAIT_UPLOAD_TIME.remove_label_values(&[tenant_id, shard_id, timeline_id]); let _ = STANDBY_HORIZON.remove_label_values(&[tenant_id, shard_id, timeline_id]); { @@ -2805,6 +2830,7 @@ pub(crate) struct RemoteTimelineClientMetrics { calls: Mutex>, bytes_started_counter: Mutex>, bytes_finished_counter: Mutex>, + pub(crate) projected_remote_consistent_lsn_gauge: UIntGauge, } impl RemoteTimelineClientMetrics { @@ -2819,6 +2845,10 @@ impl RemoteTimelineClientMetrics { .unwrap(), ); + let projected_remote_consistent_lsn_gauge = PROJECTED_REMOTE_CONSISTENT_LSN + .get_metric_with_label_values(&[&tenant_id_str, &shard_id_str, &timeline_id_str]) + .unwrap(); + RemoteTimelineClientMetrics { tenant_id: tenant_id_str, shard_id: shard_id_str, @@ -2827,6 +2857,7 @@ impl RemoteTimelineClientMetrics { bytes_started_counter: Mutex::new(HashMap::default()), bytes_finished_counter: Mutex::new(HashMap::default()), remote_physical_size_gauge, + projected_remote_consistent_lsn_gauge, } } @@ -3040,6 +3071,7 @@ impl Drop for RemoteTimelineClientMetrics 
{ calls, bytes_started_counter, bytes_finished_counter, + projected_remote_consistent_lsn_gauge, } = self; for ((a, b), _) in calls.get_mut().unwrap().drain() { let mut res = [Ok(()), Ok(())]; @@ -3069,6 +3101,14 @@ impl Drop for RemoteTimelineClientMetrics { let _ = remote_physical_size_gauge; // use to avoid 'unused' warning in desctructuring above let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]); } + { + let _ = projected_remote_consistent_lsn_gauge; + let _ = PROJECTED_REMOTE_CONSISTENT_LSN.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + ]); + } } } diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 89b935947d93..20e0536a00e5 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -2192,6 +2192,9 @@ impl RemoteTimelineClient { upload_queue.clean.1 = Some(task.task_id); let lsn = upload_queue.clean.0.metadata.disk_consistent_lsn(); + self.metrics + .projected_remote_consistent_lsn_gauge + .set(lsn.0); if self.generation.is_none() { // Legacy mode: skip validating generation diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index aab6703a3c2b..bf3d7a74a35d 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -2392,7 +2392,7 @@ impl Timeline { result .metrics - .last_record_gauge + .last_record_lsn_gauge .set(disk_consistent_lsn.0 as i64); result }) @@ -3514,7 +3514,7 @@ impl Timeline { pub(crate) fn finish_write(&self, new_lsn: Lsn) { assert!(new_lsn.is_aligned()); - self.metrics.last_record_gauge.set(new_lsn.0 as i64); + self.metrics.last_record_lsn_gauge.set(new_lsn.0 as i64); self.last_record_lsn.advance(new_lsn); } @@ -3882,6 +3882,10 @@ impl Timeline { fn set_disk_consistent_lsn(&self, new_value: Lsn) -> bool { let old_value = self.disk_consistent_lsn.fetch_max(new_value); assert!(new_value >= old_value, "disk_consistent_lsn must be growing monotonously at runtime; current {old_value}, offered {new_value}"); + + self.metrics + .disk_consistent_lsn_gauge + .set(new_value.0 as i64); new_value != old_value } diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index ffdbd988a58f..1278ed1aef54 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -152,6 +152,8 @@ def counter(name: str) -> str: "pageserver_resident_physical_size", "pageserver_io_operations_bytes_total", "pageserver_last_record_lsn", + "pageserver_disk_consistent_lsn", + "pageserver_projected_remote_consistent_lsn", "pageserver_standby_horizon", "pageserver_smgr_query_seconds_bucket", "pageserver_smgr_query_seconds_count", From 7838659197e40ecdb0735c01cb21dd2298492d24 Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 6 Dec 2024 11:24:13 +0100 Subject: [PATCH 55/65] pageserver: assert that keys belong to shard (#9943) We've seen cases where stray keys end up on the wrong shard. This shouldn't happen. Add debug assertions to prevent this. In release builds, we should be lenient in order to handle changing key ownership policies. Touches #9914. 
--- libs/pageserver_api/src/shard.rs | 5 +++-- libs/utils/src/shard.rs | 6 ++++++ pageserver/src/tenant/timeline.rs | 19 ++++++++++++++++++- pageserver/src/tenant/timeline/compaction.rs | 16 +++++++++++----- 4 files changed, 38 insertions(+), 8 deletions(-) diff --git a/libs/pageserver_api/src/shard.rs b/libs/pageserver_api/src/shard.rs index a5c94a82c162..cf0cd3a46b88 100644 --- a/libs/pageserver_api/src/shard.rs +++ b/libs/pageserver_api/src/shard.rs @@ -158,7 +158,8 @@ impl ShardIdentity { key_to_shard_number(self.count, self.stripe_size, key) } - /// Return true if the key should be ingested by this shard + /// Return true if the key is stored only on this shard. This does not include + /// global keys, see is_key_global(). /// /// Shards must ingest _at least_ keys which return true from this check. pub fn is_key_local(&self, key: &Key) -> bool { @@ -171,7 +172,7 @@ impl ShardIdentity { } /// Return true if the key should be stored on all shards, not just one. - fn is_key_global(&self, key: &Key) -> bool { + pub fn is_key_global(&self, key: &Key) -> bool { if key.is_slru_block_key() || key.is_slru_segment_size_key() || key.is_aux_file_key() { // Special keys that are only stored on shard 0 false diff --git a/libs/utils/src/shard.rs b/libs/utils/src/shard.rs index 782cddc599b0..6352ea9f9253 100644 --- a/libs/utils/src/shard.rs +++ b/libs/utils/src/shard.rs @@ -164,6 +164,12 @@ impl TenantShardId { } } +impl std::fmt::Display for ShardNumber { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + self.0.fmt(f) + } +} + impl std::fmt::Display for ShardSlug<'_> { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { write!( diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index bf3d7a74a35d..0657d1af3a84 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -53,7 +53,7 @@ use utils::{ postgres_client::PostgresClientProtocol, sync::gate::{Gate, GateGuard}, }; -use wal_decoder::serialized_batch::SerializedValueBatch; +use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta}; use std::sync::atomic::Ordering as AtomicOrdering; use std::sync::{Arc, Mutex, RwLock, Weak}; @@ -5924,6 +5924,23 @@ impl<'a> TimelineWriter<'a> { return Ok(()); } + // In debug builds, assert that we don't write any keys that don't belong to this shard. + // We don't assert this in release builds, since key ownership policies may change over + // time. Stray keys will be removed during compaction. 
+ if cfg!(debug_assertions) { + for metadata in &batch.metadata { + if let ValueMeta::Serialized(metadata) = metadata { + let key = Key::from_compact(metadata.key); + assert!( + self.shard_identity.is_key_local(&key) + || self.shard_identity.is_key_global(&key), + "key {key} does not belong on shard {}", + self.shard_identity.shard_index() + ); + } + } + } + let batch_max_lsn = batch.max_lsn; let buf_size: u64 = batch.buffer_size() as u64; diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 8ececa2bfb46..7f86ede0436c 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -1179,11 +1179,12 @@ impl Timeline { .await .map_err(CompactionError::Other)?; } else { - debug!( - "Dropping key {} during compaction (it belongs on shard {:?})", - key, - self.shard_identity.get_shard_number(&key) - ); + let shard = self.shard_identity.shard_index(); + let owner = self.shard_identity.get_shard_number(&key); + if cfg!(debug_assertions) { + panic!("key {key} does not belong on shard {shard}, owned by {owner}"); + } + debug!("dropping key {key} during compaction (it belongs on shard {owner})"); } if !new_layers.is_empty() { @@ -2054,6 +2055,11 @@ impl Timeline { // This is not handled in the filter iterator because shard is determined by hash. // Therefore, it does not give us any performance benefit to do things like skip // a whole layer file as handling key spaces (ranges). + if cfg!(debug_assertions) { + let shard = self.shard_identity.shard_index(); + let owner = self.shard_identity.get_shard_number(&key); + panic!("key {key} does not belong on shard {shard}, owned by {owner}"); + } continue; } if !job_desc.compaction_key_range.contains(&key) { From fa07097f2ff12b6560f4122e0654b24e5f9561e2 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Fri, 6 Dec 2024 12:44:50 +0100 Subject: [PATCH 56/65] chore: Reorganize and refresh CODEOWNERS (#10008) ## Problem We didn't have a codeowner for `/compute`, so nobody was auto-assigned for PRs like #9973 ## Summary of changes While on it: 1. Group codeowners into sections. 2. Remove control plane from the `/compute_tools` because it's primarily the internal `compute_ctl` code. 3. Add control plane (and compute) to `/libs/compute_api` because that's the shared public interface of the compute. 
--- CODEOWNERS | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/CODEOWNERS b/CODEOWNERS index f41462c98b1c..71b5e65f94a3 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -1,16 +1,29 @@ -/.github/ @neondatabase/developer-productivity -/compute_tools/ @neondatabase/control-plane @neondatabase/compute -/libs/pageserver_api/ @neondatabase/storage -/libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage -/libs/proxy/ @neondatabase/proxy -/libs/remote_storage/ @neondatabase/storage -/libs/safekeeper_api/ @neondatabase/storage +# Autoscaling /libs/vm_monitor/ @neondatabase/autoscaling -/pageserver/ @neondatabase/storage + +# DevProd +/.github/ @neondatabase/developer-productivity + +# Compute /pgxn/ @neondatabase/compute -/pgxn/neon/ @neondatabase/compute @neondatabase/storage +/vendor/ @neondatabase/compute +/compute/ @neondatabase/compute +/compute_tools/ @neondatabase/compute + +# Proxy +/libs/proxy/ @neondatabase/proxy /proxy/ @neondatabase/proxy + +# Storage +/pageserver/ @neondatabase/storage /safekeeper/ @neondatabase/storage /storage_controller @neondatabase/storage /storage_scrubber @neondatabase/storage -/vendor/ @neondatabase/compute +/libs/pageserver_api/ @neondatabase/storage +/libs/remote_storage/ @neondatabase/storage +/libs/safekeeper_api/ @neondatabase/storage + +# Shared +/pgxn/neon/ @neondatabase/compute @neondatabase/storage +/libs/compute_api/ @neondatabase/compute @neondatabase/control-plane +/libs/postgres_ffi/ @neondatabase/compute @neondatabase/storage From cc70fc802d2107122b330dba6ce8e2d8f8799189 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Fri, 6 Dec 2024 12:51:41 +0000 Subject: [PATCH 57/65] pageserver: add metric for number of wal records received by each shard (#10035) ## Problem With the current metrics we can't identify which shards are ingesting data at any given time. ## Summary of changes Add a metric for the number of wal records received for processing by each shard. This is per (tenant, timeline, shard). --- pageserver/src/metrics.rs | 20 +++++++++++++++++++ .../walreceiver/walreceiver_connection.rs | 8 ++++++++ test_runner/fixtures/metrics.py | 1 + 3 files changed, 29 insertions(+) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index e3b6f43bc4db..62bf9acf01cb 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -2204,6 +2204,15 @@ pub(crate) static WAL_INGEST: Lazy = Lazy::new(|| WalIngestMet .expect("failed to define a metric"), }); +pub(crate) static PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_timeline_wal_records_received", + "Number of WAL records received per shard", + &["tenant_id", "shard_id", "timeline_id"] + ) + .expect("failed to define a metric") +}); + pub(crate) static WAL_REDO_TIME: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_seconds", @@ -2431,6 +2440,7 @@ pub(crate) struct TimelineMetrics { pub evictions_with_low_residence_duration: std::sync::RwLock, /// Number of valid LSN leases. 
pub valid_lsn_lease_count_gauge: UIntGauge, + pub wal_records_received: IntCounter, shutdown: std::sync::atomic::AtomicBool, } @@ -2588,6 +2598,10 @@ impl TimelineMetrics { .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) .unwrap(); + let wal_records_received = PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED + .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id]) + .unwrap(); + TimelineMetrics { tenant_id, shard_id, @@ -2620,6 +2634,7 @@ impl TimelineMetrics { evictions_with_low_residence_duration, ), valid_lsn_lease_count_gauge, + wal_records_received, shutdown: std::sync::atomic::AtomicBool::default(), } } @@ -2757,6 +2772,11 @@ impl TimelineMetrics { shard_id, timeline_id, ]); + let _ = PAGESERVER_TIMELINE_WAL_RECORDS_RECEIVED.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + ]); } } diff --git a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs index d90ffbfa2c11..3f10eeda60a9 100644 --- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs +++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs @@ -369,6 +369,13 @@ pub(super) async fn handle_walreceiver_connection( // advances it to its end LSN. 0 is just an initialization placeholder. let mut modification = timeline.begin_modification(Lsn(0)); + if !records.is_empty() { + timeline + .metrics + .wal_records_received + .inc_by(records.len() as u64); + } + for interpreted in records { if matches!(interpreted.flush_uncommitted, FlushUncommittedRecords::Yes) && uncommitted_records > 0 @@ -510,6 +517,7 @@ pub(super) async fn handle_walreceiver_connection( } // Ingest the records without immediately committing them. + timeline.metrics.wal_records_received.inc(); let ingested = walingest .ingest_record(interpreted, &mut modification, &ctx) .await diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 1278ed1aef54..52ed7da36b3b 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -175,6 +175,7 @@ def counter(name: str) -> str: counter("pageserver_tenant_throttling_count_accounted_finish"), counter("pageserver_tenant_throttling_wait_usecs_sum"), counter("pageserver_tenant_throttling_count"), + counter("pageserver_timeline_wal_records_received"), *histogram("pageserver_page_service_batch_size"), *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, # "pageserver_directory_entries_count", -- only used if above a certain threshold From 14c4fae64af5613c682ec7dd7d30e484c476e5af Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Fri, 6 Dec 2024 16:17:15 +0100 Subject: [PATCH 58/65] test_runner/performance: add improved bulk insert benchmark (#9812) Adds an improved bulk insert benchmark, including S3 uploads. Touches #9789. 
--- test_runner/fixtures/pageserver/utils.py | 22 +-- .../performance/test_ingest_insert_bulk.py | 142 ++++++++++++++++++ 2 files changed, 149 insertions(+), 15 deletions(-) create mode 100644 test_runner/performance/test_ingest_insert_bulk.py diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index 7c10edc5fc33..66f61f9b4c37 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -54,23 +54,15 @@ def wait_for_upload( tenant: TenantId | TenantShardId, timeline: TimelineId, lsn: Lsn, + timeout=20, ): - """waits for local timeline upload up to specified lsn""" + """Waits for local timeline upload up to specified LSN""" - current_lsn = Lsn(0) - for i in range(20): - current_lsn = remote_consistent_lsn(pageserver_http, tenant, timeline) - if current_lsn >= lsn: - log.info("wait finished") - return - lr_lsn = last_record_lsn(pageserver_http, tenant, timeline) - log.info( - f"waiting for remote_consistent_lsn to reach {lsn}, now {current_lsn}, last_record_lsn={lr_lsn}, iteration {i + 1}" - ) - time.sleep(1) - raise Exception( - f"timed out while waiting for {tenant}/{timeline} remote_consistent_lsn to reach {lsn}, was {current_lsn}" - ) + def is_uploaded(): + remote_lsn = remote_consistent_lsn(pageserver_http, tenant, timeline) + assert remote_lsn >= lsn, f"remote_consistent_lsn at {remote_lsn}" + + wait_until(is_uploaded, name=f"upload to {lsn}", timeout=timeout) def _tenant_in_expected_state(tenant_info: dict[str, Any], expected_state: str): diff --git a/test_runner/performance/test_ingest_insert_bulk.py b/test_runner/performance/test_ingest_insert_bulk.py new file mode 100644 index 000000000000..283bcada31bd --- /dev/null +++ b/test_runner/performance/test_ingest_insert_bulk.py @@ -0,0 +1,142 @@ +from __future__ import annotations + +import random +from concurrent.futures import ThreadPoolExecutor + +import pytest +from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker +from fixtures.common_types import Lsn +from fixtures.log_helper import log +from fixtures.neon_fixtures import ( + NeonEnvBuilder, + wait_for_last_flush_lsn, +) +from fixtures.pageserver.utils import ( + wait_for_last_record_lsn, + wait_for_upload, + wait_for_upload_queue_empty, +) +from fixtures.remote_storage import s3_storage + + +@pytest.mark.timeout(900) +@pytest.mark.parametrize("size", [8, 1024, 8192]) +@pytest.mark.parametrize("s3", [True, False], ids=["s3", "local"]) +@pytest.mark.parametrize("backpressure", [True, False], ids=["backpressure", "nobackpressure"]) +@pytest.mark.parametrize("fsync", [True, False], ids=["fsync", "nofsync"]) +def test_ingest_insert_bulk( + request: pytest.FixtureRequest, + neon_env_builder: NeonEnvBuilder, + zenbenchmark: NeonBenchmarker, + fsync: bool, + backpressure: bool, + s3: bool, + size: int, +): + """ + Benchmarks ingestion of 5 GB of sequential insert WAL. Measures ingestion and S3 upload + separately. Also does a Safekeeper→Pageserver re-ingestion to measure Pageserver ingestion in + isolation. + """ + + CONCURRENCY = 1 # 1 is optimal without fsync or backpressure + VOLUME = 5 * 1024**3 + rows = VOLUME // (size + 64) # +64 roughly accounts for per-row WAL overhead + + neon_env_builder.safekeepers_enable_fsync = fsync + + if s3: + neon_env_builder.enable_pageserver_remote_storage(s3_storage()) + # NB: don't use S3 for Safekeeper. It doesn't affect throughput (no backpressure), but it + # would compete with Pageserver for bandwidth. 
+ # neon_env_builder.enable_safekeeper_remote_storage(s3_storage()) + + neon_env_builder.disable_scrub_on_exit() # immediate shutdown may leave stray layers + env = neon_env_builder.init_start() + + endpoint = env.endpoints.create_start( + "main", + config_lines=[ + f"fsync = {fsync}", + "max_replication_apply_lag = 0", + f"max_replication_flush_lag = {'10GB' if backpressure else '0'}", + # NB: neon_local defaults to 15MB, which is too slow -- production uses 500MB. + f"max_replication_write_lag = {'500MB' if backpressure else '0'}", + ], + ) + endpoint.safe_psql("create extension neon") + + # Wait for the timeline to be propagated to the pageserver. + wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, env.initial_timeline) + + # Ingest rows. + log.info("Ingesting data") + start_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0]) + + def insert_rows(endpoint, table, count, value): + with endpoint.connect().cursor() as cur: + cur.execute("set statement_timeout = 0") + cur.execute(f"create table {table} (id int, data bytea)") + cur.execute(f"insert into {table} values (generate_series(1, {count}), %s)", (value,)) + + with zenbenchmark.record_duration("upload"): + with zenbenchmark.record_duration("ingest"): + with ThreadPoolExecutor(max_workers=CONCURRENCY) as pool: + for i in range(CONCURRENCY): + # Write a random value for all rows. This is sufficient to prevent compression, + # e.g. in TOAST. Randomly generating every row is too slow. + value = random.randbytes(size) + worker_rows = rows / CONCURRENCY + pool.submit(insert_rows, endpoint, f"table{i}", worker_rows, value) + + end_lsn = Lsn(endpoint.safe_psql("select pg_current_wal_lsn()")[0][0]) + + # Wait for pageserver to ingest the WAL. + client = env.pageserver.http_client() + wait_for_last_record_lsn(client, env.initial_tenant, env.initial_timeline, end_lsn) + + # Wait for pageserver S3 upload. Checkpoint to flush the last in-memory layer. + client.timeline_checkpoint( + env.initial_tenant, + env.initial_timeline, + compact=False, + wait_until_flushed=False, + ) + wait_for_upload(client, env.initial_tenant, env.initial_timeline, end_lsn, timeout=600) + + # Empty out upload queue for next benchmark. + wait_for_upload_queue_empty(client, env.initial_tenant, env.initial_timeline) + + backpressure_time = endpoint.safe_psql("select backpressure_throttling_time()")[0][0] + + # Now that all data is ingested, delete and recreate the tenant in the pageserver. This will + # reingest all the WAL directly from the safekeeper. This gives us a baseline of how fast the + # pageserver can ingest this WAL in isolation. + status = env.storage_controller.inspect(tenant_shard_id=env.initial_tenant) + assert status is not None + + endpoint.stop() # avoid spurious getpage errors + client.tenant_delete(env.initial_tenant) + env.pageserver.tenant_create(tenant_id=env.initial_tenant, generation=status[0]) + + with zenbenchmark.record_duration("recover"): + log.info("Recovering WAL into pageserver") + client.timeline_create(env.pg_version, env.initial_tenant, env.initial_timeline) + wait_for_last_record_lsn(client, env.initial_tenant, env.initial_timeline, end_lsn) + + # Emit metrics. 
+ wal_written_mb = round((end_lsn - start_lsn) / (1024 * 1024)) + zenbenchmark.record("wal_written", wal_written_mb, "MB", MetricReport.TEST_PARAM) + zenbenchmark.record("row_count", rows, "rows", MetricReport.TEST_PARAM) + zenbenchmark.record("concurrency", CONCURRENCY, "clients", MetricReport.TEST_PARAM) + zenbenchmark.record( + "backpressure_time", backpressure_time // 1000, "ms", MetricReport.LOWER_IS_BETTER + ) + + props = {p["name"]: p["value"] for _, p in request.node.user_properties} + for name in ("ingest", "upload", "recover"): + throughput = int(wal_written_mb / props[name]) + zenbenchmark.record(f"{name}_throughput", throughput, "MB/s", MetricReport.HIGHER_IS_BETTER) + + # Pageserver shutdown will likely get stuck on the upload queue, just shut it down immediately. + env.stop(immediate=True) From e4837b0a5a65e8515949fad634d147cb2c2a8caf Mon Sep 17 00:00:00 2001 From: Tristan Partin Date: Fri, 6 Dec 2024 11:43:55 -0600 Subject: [PATCH 59/65] Bump sql_exporter to 0.16.0 (#10041) Signed-off-by: Tristan Partin --- build-tools.Dockerfile | 2 +- compute/compute-node.Dockerfile | 2 +- test_runner/regress/test_compute_metrics.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/build-tools.Dockerfile b/build-tools.Dockerfile index 2671702697b7..fa84e467ad61 100644 --- a/build-tools.Dockerfile +++ b/build-tools.Dockerfile @@ -115,7 +115,7 @@ RUN set -e \ # Keep the version the same as in compute/compute-node.Dockerfile and # test_runner/regress/test_compute_metrics.py. -ENV SQL_EXPORTER_VERSION=0.13.1 +ENV SQL_EXPORTER_VERSION=0.16.0 RUN curl -fsSL \ "https://github.com/burningalchemist/sql_exporter/releases/download/${SQL_EXPORTER_VERSION}/sql_exporter-${SQL_EXPORTER_VERSION}.linux-$(case "$(uname -m)" in x86_64) echo amd64;; aarch64) echo arm64;; esac).tar.gz" \ --output sql_exporter.tar.gz \ diff --git a/compute/compute-node.Dockerfile b/compute/compute-node.Dockerfile index bf6311bf2b8d..33d2a1028521 100644 --- a/compute/compute-node.Dockerfile +++ b/compute/compute-node.Dockerfile @@ -1324,7 +1324,7 @@ FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.1 AS postgres-exporter # Keep the version the same as in build-tools.Dockerfile and # test_runner/regress/test_compute_metrics.py. -FROM burningalchemist/sql_exporter:0.13.1 AS sql-exporter +FROM burningalchemist/sql_exporter:0.16.0 AS sql-exporter ######################################################################################### # diff --git a/test_runner/regress/test_compute_metrics.py b/test_runner/regress/test_compute_metrics.py index 1b15c5f15efa..787790103fa4 100644 --- a/test_runner/regress/test_compute_metrics.py +++ b/test_runner/regress/test_compute_metrics.py @@ -215,7 +215,7 @@ def __init__( # # The "host" network mode allows sql_exporter to talk to the # endpoint which is running on the host. - super().__init__("docker.io/burningalchemist/sql_exporter:0.13.1", network_mode="host") + super().__init__("docker.io/burningalchemist/sql_exporter:0.16.0", network_mode="host") self.__logs_dir = logs_dir self.__port = port From c42c28b339289a872400a4e9f0d1b4cc02048354 Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Fri, 6 Dec 2024 13:44:26 -0500 Subject: [PATCH 60/65] feat(pageserver): gc-compaction split job and partial scheduler (#9897) ## Problem part of https://github.com/neondatabase/neon/issues/9114, stacked PR over #9809 The compaction scheduler now schedules partial compaction jobs. 
## Summary of changes * Add the compaction job splitter based on size. * Schedule subcompactions using the compaction scheduler. * Test subcompaction scheduler in the smoke regress test. * Temporarily disable layer map checks --------- Signed-off-by: Alex Chi Z --- pageserver/src/http/routes.rs | 10 +- pageserver/src/tenant.rs | 49 +++++- pageserver/src/tenant/timeline.rs | 7 + pageserver/src/tenant/timeline/compaction.rs | 162 +++++++++++++++++-- test_runner/regress/test_compaction.py | 1 + 5 files changed, 209 insertions(+), 20 deletions(-) diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index b7fddb065c61..0f11bbc50790 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -2036,15 +2036,23 @@ async fn timeline_compact_handler( parse_query_param::<_, bool>(&request, "wait_until_scheduled_compaction_done")? .unwrap_or(false); + let sub_compaction = compact_request + .as_ref() + .map(|r| r.sub_compaction) + .unwrap_or(false); let options = CompactOptions { compact_range: compact_request .as_ref() .and_then(|r| r.compact_range.clone()), compact_below_lsn: compact_request.as_ref().and_then(|r| r.compact_below_lsn), flags, + sub_compaction, }; - let scheduled = compact_request.map(|r| r.scheduled).unwrap_or(false); + let scheduled = compact_request + .as_ref() + .map(|r| r.scheduled) + .unwrap_or(false); async { let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download); diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 306ec9f5486e..4a9c44aefdbc 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -49,6 +49,7 @@ use timeline::import_pgdata; use timeline::offload::offload_timeline; use timeline::CompactFlags; use timeline::CompactOptions; +use timeline::CompactionError; use timeline::ShutdownMode; use tokio::io::BufReader; use tokio::sync::watch; @@ -2987,10 +2988,16 @@ impl Tenant { if has_pending_l0_compaction_task { Some(true) } else { - let has_pending_scheduled_compaction_task; + let mut has_pending_scheduled_compaction_task; let next_scheduled_compaction_task = { let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); if let Some(tline_pending_tasks) = guard.get_mut(timeline_id) { + if !tline_pending_tasks.is_empty() { + info!( + "{} tasks left in the compaction schedule queue", + tline_pending_tasks.len() + ); + } let next_task = tline_pending_tasks.pop_front(); has_pending_scheduled_compaction_task = !tline_pending_tasks.is_empty(); next_task @@ -3007,6 +3014,32 @@ impl Tenant { .contains(CompactFlags::EnhancedGcBottomMostCompaction) { warn!("ignoring scheduled compaction task: scheduled task must be gc compaction: {:?}", next_scheduled_compaction_task.options); + } else if next_scheduled_compaction_task.options.sub_compaction { + info!("running scheduled enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs"); + let jobs = timeline + .gc_compaction_split_jobs(next_scheduled_compaction_task.options) + .await + .map_err(CompactionError::Other)?; + if jobs.is_empty() { + info!("no jobs to run, skipping scheduled compaction task"); + } else { + has_pending_scheduled_compaction_task = true; + let jobs_len = jobs.len(); + let mut guard = self.scheduled_compaction_tasks.lock().unwrap(); + let tline_pending_tasks = guard.entry(*timeline_id).or_default(); + for (idx, job) in jobs.into_iter().enumerate() { + tline_pending_tasks.push_back(ScheduledCompactionTask { + options: job, + result_tx: if idx == jobs_len - 1 { + // The last compaction 
job sends the completion signal + next_scheduled_compaction_task.result_tx.take() + } else { + None + }, + }); + } + info!("scheduled enhanced gc bottom-most compaction with sub-compaction, split into {} jobs", jobs_len); + } } else { let _ = timeline .compact_with_options( @@ -9244,7 +9277,7 @@ mod tests { CompactOptions { flags: dryrun_flags, compact_range: None, - compact_below_lsn: None, + ..Default::default() }, &ctx, ) @@ -9481,7 +9514,7 @@ mod tests { CompactOptions { flags: dryrun_flags, compact_range: None, - compact_below_lsn: None, + ..Default::default() }, &ctx, ) @@ -9973,7 +10006,7 @@ mod tests { CompactOptions { flags: EnumSet::new(), compact_range: Some((get_key(0)..get_key(2)).into()), - compact_below_lsn: None, + ..Default::default() }, &ctx, ) @@ -10020,7 +10053,7 @@ mod tests { CompactOptions { flags: EnumSet::new(), compact_range: Some((get_key(2)..get_key(4)).into()), - compact_below_lsn: None, + ..Default::default() }, &ctx, ) @@ -10072,7 +10105,7 @@ mod tests { CompactOptions { flags: EnumSet::new(), compact_range: Some((get_key(4)..get_key(9)).into()), - compact_below_lsn: None, + ..Default::default() }, &ctx, ) @@ -10123,7 +10156,7 @@ mod tests { CompactOptions { flags: EnumSet::new(), compact_range: Some((get_key(9)..get_key(10)).into()), - compact_below_lsn: None, + ..Default::default() }, &ctx, ) @@ -10179,7 +10212,7 @@ mod tests { CompactOptions { flags: EnumSet::new(), compact_range: Some((get_key(0)..get_key(10)).into()), - compact_below_lsn: None, + ..Default::default() }, &ctx, ) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 0657d1af3a84..8f1d5f6577a6 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -785,6 +785,9 @@ pub(crate) struct CompactRequest { /// Whether the compaction job should be scheduled. #[serde(default)] pub scheduled: bool, + /// Whether the compaction job should be split across key ranges. + #[serde(default)] + pub sub_compaction: bool, } #[serde_with::serde_as] @@ -814,6 +817,9 @@ pub(crate) struct CompactOptions { /// If set, the compaction will only compact the LSN below this value. /// This option is only used by GC compaction. pub compact_below_lsn: Option, + /// Enable sub-compaction (split compaction job across key ranges). + /// This option is only used by GC compaction. 
+ pub sub_compaction: bool, } impl std::fmt::Debug for Timeline { @@ -1637,6 +1643,7 @@ impl Timeline { flags, compact_range: None, compact_below_lsn: None, + sub_compaction: false, }, ctx, ) diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs index 7f86ede0436c..a18e157d37b4 100644 --- a/pageserver/src/tenant/timeline/compaction.rs +++ b/pageserver/src/tenant/timeline/compaction.rs @@ -10,8 +10,8 @@ use std::sync::Arc; use super::layer_manager::LayerManager; use super::{ - CompactFlags, CompactOptions, CreateImageLayersError, DurationRecorder, ImageLayerCreationMode, - RecordedDuration, Timeline, + CompactFlags, CompactOptions, CompactRange, CreateImageLayersError, DurationRecorder, + ImageLayerCreationMode, RecordedDuration, Timeline, }; use anyhow::{anyhow, bail, Context}; @@ -29,7 +29,6 @@ use utils::id::TimelineId; use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder}; use crate::page_cache; use crate::statvfs::Statvfs; -use crate::tenant::checks::check_valid_layermap; use crate::tenant::remote_timeline_client::WaitCompletionError; use crate::tenant::storage_layer::batch_split_writer::{ BatchWriterResult, SplitDeltaLayerWriter, SplitImageLayerWriter, @@ -1752,6 +1751,116 @@ impl Timeline { Ok(()) } + /// Split a gc-compaction job into multiple compaction jobs. Optimally, this function should return a vector of + /// `GcCompactionJobDesc`. But we want to keep it simple on the tenant scheduling side without exposing too much + /// ad-hoc information about gc compaction itself. + pub(crate) async fn gc_compaction_split_jobs( + self: &Arc, + options: CompactOptions, + ) -> anyhow::Result> { + if !options.sub_compaction { + return Ok(vec![options]); + } + let compact_range = options.compact_range.clone().unwrap_or(CompactRange { + start: Key::MIN, + end: Key::MAX, + }); + let compact_below_lsn = if let Some(compact_below_lsn) = options.compact_below_lsn { + compact_below_lsn + } else { + let gc_info = self.gc_info.read().unwrap(); + gc_info.cutoffs.select_min() // use the real gc cutoff + }; + let mut compact_jobs = Vec::new(); + // For now, we simply use the key partitioning information; we should do a more fine-grained partitioning + // by estimating the amount of files read for a compaction job. We should also partition on LSN. + let Ok(partition) = self.partitioning.try_lock() else { + bail!("failed to acquire partition lock"); + }; + let ((dense_ks, sparse_ks), _) = &*partition; + // Truncate the key range to be within user specified compaction range. 
+ fn truncate_to( + source_start: &Key, + source_end: &Key, + target_start: &Key, + target_end: &Key, + ) -> Option<(Key, Key)> { + let start = source_start.max(target_start); + let end = source_end.min(target_end); + if start < end { + Some((*start, *end)) + } else { + None + } + } + let mut split_key_ranges = Vec::new(); + let ranges = dense_ks + .parts + .iter() + .map(|partition| partition.ranges.iter()) + .chain(sparse_ks.parts.iter().map(|x| x.0.ranges.iter())) + .flatten() + .cloned() + .collect_vec(); + for range in ranges.iter() { + let Some((start, end)) = truncate_to( + &range.start, + &range.end, + &compact_range.start, + &compact_range.end, + ) else { + continue; + }; + split_key_ranges.push((start, end)); + } + split_key_ranges.sort(); + let guard = self.layers.read().await; + let layer_map = guard.layer_map()?; + let mut current_start = None; + // Split compaction job to about 2GB each + const GC_COMPACT_MAX_SIZE_MB: u64 = 4 * 1024; // 4GB, TODO: should be configuration in the future + let ranges_num = split_key_ranges.len(); + for (idx, (start, end)) in split_key_ranges.into_iter().enumerate() { + if current_start.is_none() { + current_start = Some(start); + } + let start = current_start.unwrap(); + if start >= end { + // We have already processed this partition. + continue; + } + let res = layer_map.range_search(start..end, compact_below_lsn); + let total_size = res.found.keys().map(|x| x.layer.file_size()).sum::(); + if total_size > GC_COMPACT_MAX_SIZE_MB * 1024 * 1024 || ranges_num == idx + 1 { + let mut compact_options = options.clone(); + // Try to extend the compaction range so that we include at least one full layer file. + let extended_end = res + .found + .keys() + .map(|layer| layer.layer.key_range.end) + .min(); + // It is possible that the search range does not contain any layer files when we reach the end of the loop. + // In this case, we simply use the specified key range end. + let end = if let Some(extended_end) = extended_end { + extended_end.max(end) + } else { + end + }; + info!( + "splitting compaction job: {}..{}, estimated_size={}", + start, end, total_size + ); + compact_options.compact_range = Some(CompactRange { start, end }); + compact_options.compact_below_lsn = Some(compact_below_lsn); + compact_options.sub_compaction = false; + compact_jobs.push(compact_options); + current_start = Some(end); + } + } + drop(guard); + Ok(compact_jobs) + } + /// An experimental compaction building block that combines compaction with garbage collection. 
/// /// The current implementation picks all delta + image layers that are below or intersecting with @@ -1774,6 +1883,36 @@ impl Timeline { options: CompactOptions, ctx: &RequestContext, ) -> anyhow::Result<()> { + if options.sub_compaction { + info!("running enhanced gc bottom-most compaction with sub-compaction, splitting compaction jobs"); + let jobs = self.gc_compaction_split_jobs(options).await?; + let jobs_len = jobs.len(); + for (idx, job) in jobs.into_iter().enumerate() { + info!( + "running enhanced gc bottom-most compaction, sub-compaction {}/{}", + idx + 1, + jobs_len + ); + self.compact_with_gc_inner(cancel, job, ctx).await?; + } + if jobs_len == 0 { + info!("no jobs to run, skipping gc bottom-most compaction"); + } + return Ok(()); + } + self.compact_with_gc_inner(cancel, options, ctx).await + } + + async fn compact_with_gc_inner( + self: &Arc, + cancel: &CancellationToken, + options: CompactOptions, + ctx: &RequestContext, + ) -> anyhow::Result<()> { + assert!( + !options.sub_compaction, + "sub-compaction should be handled by the outer function" + ); // Block other compaction/GC tasks from running for now. GC-compaction could run along // with legacy compaction tasks in the future. Always ensure the lock order is compaction -> gc. // Note that we already acquired the compaction lock when the outer `compact` function gets called. @@ -1943,14 +2082,15 @@ impl Timeline { // Step 1: construct a k-merge iterator over all layers. // Also, verify if the layer map can be split by drawing a horizontal line at every LSN start/end split point. - let layer_names = job_desc - .selected_layers - .iter() - .map(|layer| layer.layer_desc().layer_name()) - .collect_vec(); - if let Some(err) = check_valid_layermap(&layer_names) { - warn!("gc-compaction layer map check failed because {}, this is normal if partial compaction is not finished yet", err); - } + // disable the check for now because we need to adjust the check for partial compactions, will enable later. + // let layer_names = job_desc + // .selected_layers + // .iter() + // .map(|layer| layer.layer_desc().layer_name()) + // .collect_vec(); + // if let Some(err) = check_valid_layermap(&layer_names) { + // warn!("gc-compaction layer map check failed because {}, this is normal if partial compaction is not finished yet", err); + // } // The maximum LSN we are processing in this compaction loop let end_lsn = job_desc .selected_layers diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index de6653eb3f4e..e92dc47f3980 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -159,6 +159,7 @@ def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder): enhanced_gc_bottom_most_compaction=True, body={ "scheduled": True, + "sub_compaction": True, "compact_range": { "start": "000000000000000000000000000000000000", # skip the SLRU range for now -- it races with get-lsn-by-timestamp, TODO: fix this From b6eea655976ad7ebffd9b7edbf193850d2b2b05b Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Fri, 6 Dec 2024 22:56:57 +0200 Subject: [PATCH 61/65] Fix error message if PS connection is lost while receiving prefetch (#9923) If the pageserver connection is lost while receiving the prefetch request, the prefetch queue is cleared. The error message prints the values from the prefetch slot, but because the slot was already cleared, they're all zeros: LOG: [NEON_SMGR] [shard 0] No response from reading prefetch entry 0: 0/0/0.0 block 0. 
This can be caused by a concurrent disconnect. To fix, make local copies of the values. In passing, also add a sanity check that if the receive() call succeeds, the prefetch slot is still intact. --- pgxn/neon/pagestore_smgr.c | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/pgxn/neon/pagestore_smgr.c b/pgxn/neon/pagestore_smgr.c index a5e0c402fbb7..880c0de64e61 100644 --- a/pgxn/neon/pagestore_smgr.c +++ b/pgxn/neon/pagestore_smgr.c @@ -610,6 +610,9 @@ prefetch_read(PrefetchRequest *slot) { NeonResponse *response; MemoryContext old; + BufferTag buftag; + shardno_t shard_no; + uint64 my_ring_index; Assert(slot->status == PRFS_REQUESTED); Assert(slot->response == NULL); @@ -623,11 +626,29 @@ prefetch_read(PrefetchRequest *slot) slot->status, slot->response, (long)slot->my_ring_index, (long)MyPState->ring_receive); + /* + * Copy the request info so that if an error happens and the prefetch + * queue is flushed during the receive call, we can print the original + * values in the error message + */ + buftag = slot->buftag; + shard_no = slot->shard_no; + my_ring_index = slot->my_ring_index; + old = MemoryContextSwitchTo(MyPState->errctx); - response = (NeonResponse *) page_server->receive(slot->shard_no); + response = (NeonResponse *) page_server->receive(shard_no); MemoryContextSwitchTo(old); if (response) { + /* The slot should still be valid */ + if (slot->status != PRFS_REQUESTED || + slot->response != NULL || + slot->my_ring_index != MyPState->ring_receive) + neon_shard_log(shard_no, ERROR, + "Incorrect prefetch slot state after receive: status=%d response=%p my=%lu receive=%lu", + slot->status, slot->response, + (long) slot->my_ring_index, (long) MyPState->ring_receive); + /* update prefetch state */ MyPState->n_responses_buffered += 1; MyPState->n_requests_inflight -= 1; @@ -642,11 +663,15 @@ prefetch_read(PrefetchRequest *slot) } else { - neon_shard_log(slot->shard_no, LOG, + /* + * Note: The slot might no longer be valid, if the connection was lost + * and the prefetch queue was flushed during the receive call + */ + neon_shard_log(shard_no, LOG, "No response from reading prefetch entry %lu: %u/%u/%u.%u block %u. This can be caused by a concurrent disconnect", - (long)slot->my_ring_index, - RelFileInfoFmt(BufTagGetNRelFileInfo(slot->buftag)), - slot->buftag.forkNum, slot->buftag.blockNum); + (long) my_ring_index, + RelFileInfoFmt(BufTagGetNRelFileInfo(buftag)), + buftag.forkNum, buftag.blockNum); return false; } } From b1fd086c0c974447376d23cd6e3baf4f8248a1ce Mon Sep 17 00:00:00 2001 From: "Alex Chi Z." <4198311+skyzh@users.noreply.github.com> Date: Fri, 6 Dec 2024 17:30:04 -0500 Subject: [PATCH 62/65] test(pageserver): disable gc_compaction smoke test for now (#10045) ## Problem The test is flaky. ## Summary of changes Disable the test.
--------- Signed-off-by: Alex Chi Z --- test_runner/regress/test_compaction.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test_runner/regress/test_compaction.py b/test_runner/regress/test_compaction.py index e92dc47f3980..881503046ce3 100644 --- a/test_runner/regress/test_compaction.py +++ b/test_runner/regress/test_compaction.py @@ -121,6 +121,9 @@ def test_pageserver_compaction_smoke( assert vectored_average < 8 +@pytest.mark.skip( + "This is being fixed and tracked in https://github.com/neondatabase/neon/issues/9114" +) @skip_in_debug_build("only run with release build") def test_pageserver_gc_compaction_smoke(neon_env_builder: NeonEnvBuilder): SMOKE_CONF = { From 4d7111f240062e161af1c298ffc5c28b5ed695fe Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Sat, 7 Dec 2024 09:57:55 +0100 Subject: [PATCH 63/65] page_service: don't count time spent flushing towards smgr latency metrics (#10042) ## Problem In #9962 I changed the smgr metrics to include time spent on flush. It isn't under our (=storage team's) control how long that flush takes because the client can stop reading requests. ## Summary of changes Stop the timer as soon as we've buffered up the response in the `pgb_writer`. Track flush time in a separate metric. --------- Co-authored-by: Yuchen Liang <70461588+yliang412@users.noreply.github.com> --- pageserver/src/metrics.rs | 138 +++++++++++++++++++++++++++++--- pageserver/src/page_service.rs | 76 ++++++++++++------ test_runner/fixtures/metrics.py | 1 + 3 files changed, 179 insertions(+), 36 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index 62bf9acf01cb..96ee1578563b 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -1223,31 +1223,60 @@ pub(crate) mod virtual_file_io_engine { }); } -pub(crate) struct SmgrOpTimer { +pub(crate) struct SmgrOpTimer(Option); +pub(crate) struct SmgrOpTimerInner { global_latency_histo: Histogram, // Optional because not all op types are tracked per-timeline per_timeline_latency_histo: Option, + global_flush_in_progress_micros: IntCounter, + per_timeline_flush_in_progress_micros: IntCounter, + start: Instant, throttled: Duration, op: SmgrQueryType, } +pub(crate) struct SmgrOpFlushInProgress { + base: Instant, + global_micros: IntCounter, + per_timeline_micros: IntCounter, +} + impl SmgrOpTimer { pub(crate) fn deduct_throttle(&mut self, throttle: &Option) { let Some(throttle) = throttle else { return; }; - self.throttled += *throttle; + let inner = self.0.as_mut().expect("other public methods consume self"); + inner.throttled += *throttle; } -} -impl Drop for SmgrOpTimer { - fn drop(&mut self) { - let elapsed = self.start.elapsed(); + pub(crate) fn observe_smgr_op_completion_and_start_flushing(mut self) -> SmgrOpFlushInProgress { + let (flush_start, inner) = self + .smgr_op_end() + .expect("this method consume self, and the only other caller is drop handler"); + let SmgrOpTimerInner { + global_flush_in_progress_micros, + per_timeline_flush_in_progress_micros, + .. + } = inner; + SmgrOpFlushInProgress { + base: flush_start, + global_micros: global_flush_in_progress_micros, + per_timeline_micros: per_timeline_flush_in_progress_micros, + } + } - let elapsed = match elapsed.checked_sub(self.throttled) { + /// Returns `None`` if this method has already been called, `Some` otherwise. 
+ fn smgr_op_end(&mut self) -> Option<(Instant, SmgrOpTimerInner)> { + let inner = self.0.take()?; + + let now = Instant::now(); + let elapsed = now - inner.start; + + let elapsed = match elapsed.checked_sub(inner.throttled) { Some(elapsed) => elapsed, None => { use utils::rate_limit::RateLimit; @@ -1258,9 +1287,9 @@ impl Drop for SmgrOpTimer { }))) }); let mut guard = LOGGED.lock().unwrap(); - let rate_limit = &mut guard[self.op]; + let rate_limit = &mut guard[inner.op]; rate_limit.call(|| { - warn!(op=?self.op, ?elapsed, ?self.throttled, "implementation error: time spent throttled exceeds total request wall clock time"); + warn!(op=?inner.op, ?elapsed, ?inner.throttled, "implementation error: time spent throttled exceeds total request wall clock time"); }); elapsed // un-throttled time, more info than just saturating to 0 } @@ -1268,10 +1297,54 @@ impl Drop for SmgrOpTimer { let elapsed = elapsed.as_secs_f64(); - self.global_latency_histo.observe(elapsed); - if let Some(per_timeline_getpage_histo) = &self.per_timeline_latency_histo { + inner.global_latency_histo.observe(elapsed); + if let Some(per_timeline_getpage_histo) = &inner.per_timeline_latency_histo { per_timeline_getpage_histo.observe(elapsed); } + + Some((now, inner)) + } +} + +impl Drop for SmgrOpTimer { + fn drop(&mut self) { + self.smgr_op_end(); + } +} + +impl SmgrOpFlushInProgress { + pub(crate) async fn measure(mut self, mut fut: Fut) -> O + where + Fut: std::future::Future, + { + let mut fut = std::pin::pin!(fut); + + let now = Instant::now(); + // Whenever observe_guard gets called, or dropped, + // it adds the time elapsed since its last call to metrics. + // Last call is tracked in `now`. + let mut observe_guard = scopeguard::guard( + || { + let elapsed = now - self.base; + self.global_micros + .inc_by(u64::try_from(elapsed.as_micros()).unwrap()); + self.per_timeline_micros + .inc_by(u64::try_from(elapsed.as_micros()).unwrap()); + self.base = now; + }, + |mut observe| { + observe(); + }, + ); + + loop { + match tokio::time::timeout(Duration::from_secs(10), &mut fut).await { + Ok(v) => return v, + Err(_timeout) => { + (*observe_guard)(); + } + } + } } } @@ -1302,6 +1375,8 @@ pub(crate) struct SmgrQueryTimePerTimeline { per_timeline_getpage_latency: Histogram, global_batch_size: Histogram, per_timeline_batch_size: Histogram, + global_flush_in_progress_micros: IntCounter, + per_timeline_flush_in_progress_micros: IntCounter, } static SMGR_QUERY_STARTED_GLOBAL: Lazy = Lazy::new(|| { @@ -1464,6 +1539,26 @@ fn set_page_service_config_max_batch_size(conf: &PageServicePipeliningConfig) { .set(value.try_into().unwrap()); } +static PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS: Lazy = Lazy::new(|| { + register_int_counter_vec!( + "pageserver_page_service_pagestream_flush_in_progress_micros", + "Counter that sums up the microseconds that a pagestream response was being flushed into the TCP connection. \ + If the flush is particularly slow, this counter will be updated periodically to make slow flushes \ + easily discoverable in monitoring. 
\ + Hence, this is NOT a completion latency historgram.", + &["tenant_id", "shard_id", "timeline_id"], + ) + .expect("failed to define a metric") +}); + +static PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL: Lazy = Lazy::new(|| { + register_int_counter!( + "pageserver_page_service_pagestream_flush_in_progress_micros_global", + "Like pageserver_page_service_pagestream_flush_in_progress_seconds, but instance-wide.", + ) + .expect("failed to define a metric") +}); + impl SmgrQueryTimePerTimeline { pub(crate) fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self { let tenant_id = tenant_shard_id.tenant_id.to_string(); @@ -1504,6 +1599,12 @@ impl SmgrQueryTimePerTimeline { .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id]) .unwrap(); + let global_flush_in_progress_micros = + PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS_GLOBAL.clone(); + let per_timeline_flush_in_progress_micros = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS + .get_metric_with_label_values(&[&tenant_id, &shard_slug, &timeline_id]) + .unwrap(); + Self { global_started, global_latency, @@ -1511,6 +1612,8 @@ impl SmgrQueryTimePerTimeline { per_timeline_getpage_started, global_batch_size, per_timeline_batch_size, + global_flush_in_progress_micros, + per_timeline_flush_in_progress_micros, } } pub(crate) fn start_smgr_op(&self, op: SmgrQueryType, started_at: Instant) -> SmgrOpTimer { @@ -1523,13 +1626,17 @@ impl SmgrQueryTimePerTimeline { None }; - SmgrOpTimer { + SmgrOpTimer(Some(SmgrOpTimerInner { global_latency_histo: self.global_latency[op as usize].clone(), per_timeline_latency_histo, start: started_at, op, throttled: Duration::ZERO, - } + global_flush_in_progress_micros: self.global_flush_in_progress_micros.clone(), + per_timeline_flush_in_progress_micros: self + .per_timeline_flush_in_progress_micros + .clone(), + })) } pub(crate) fn observe_getpage_batch_start(&self, batch_size: usize) { @@ -2777,6 +2884,11 @@ impl TimelineMetrics { shard_id, timeline_id, ]); + let _ = PAGE_SERVICE_SMGR_FLUSH_INPROGRESS_MICROS.remove_label_values(&[ + tenant_id, + shard_id, + timeline_id, + ]); } } diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs index 7026df952751..97d94bbe7f33 100644 --- a/pageserver/src/page_service.rs +++ b/pageserver/src/page_service.rs @@ -1017,10 +1017,8 @@ impl PageServerHandler { // Map handler result to protocol behavior. // Some handler errors cause exit from pagestream protocol. // Other handler errors are sent back as an error message and we stay in pagestream protocol. - let mut timers: smallvec::SmallVec<[_; 1]> = - smallvec::SmallVec::with_capacity(handler_results.len()); for handler_result in handler_results { - let response_msg = match handler_result { + let (response_msg, timer) = match handler_result { Err(e) => match &e { PageStreamError::Shutdown => { // If we fail to fulfil a request during shutdown, which may be _because_ of @@ -1044,34 +1042,66 @@ impl PageServerHandler { span.in_scope(|| { error!("error reading relation or page version: {full:#}") }); - PagestreamBeMessage::Error(PagestreamErrorResponse { - message: e.to_string(), - }) + ( + PagestreamBeMessage::Error(PagestreamErrorResponse { + message: e.to_string(), + }), + None, // TODO: measure errors + ) } }, - Ok((response_msg, timer)) => { - // Extending the lifetime of the timers so observations on drop - // include the flush time. 
- timers.push(timer); - response_msg - } + Ok((response_msg, timer)) => (response_msg, Some(timer)), }; + // // marshal & transmit response message + // + pgb_writer.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?; - } - tokio::select! { - biased; - _ = cancel.cancelled() => { - // We were requested to shut down. - info!("shutdown request received in page handler"); - return Err(QueryError::Shutdown) - } - res = pgb_writer.flush() => { - res?; + + // We purposefully don't count flush time into the timer. + // + // The reason is that current compute client will not perform protocol processing + // if the postgres backend process is doing things other than `->smgr_read()`. + // This is especially the case for prefetch. + // + // If the compute doesn't read from the connection, eventually TCP will backpressure + // all the way into our flush call below. + // + // The timer's underlying metric is used for a storage-internal latency SLO and + // we don't want to include latency in it that we can't control. + // And as pointed out above, in this case, we don't control the time that flush will take. + let flushing_timer = + timer.map(|timer| timer.observe_smgr_op_completion_and_start_flushing()); + + // what we want to do + let flush_fut = pgb_writer.flush(); + // metric for how long flushing takes + let flush_fut = match flushing_timer { + Some(flushing_timer) => { + futures::future::Either::Left(flushing_timer.measure(flush_fut)) + } + None => futures::future::Either::Right(flush_fut), + }; + // do it while respecting cancellation + let _: () = async move { + tokio::select! { + biased; + _ = cancel.cancelled() => { + // We were requested to shut down. + info!("shutdown request received in page handler"); + return Err(QueryError::Shutdown) + } + res = flush_fut => { + res?; + } + } + Ok(()) } + // and log the info! line inside the request span + .instrument(span.clone()) + .await?; } - drop(timers); Ok(()) } diff --git a/test_runner/fixtures/metrics.py b/test_runner/fixtures/metrics.py index 52ed7da36b3b..a591e088eff7 100644 --- a/test_runner/fixtures/metrics.py +++ b/test_runner/fixtures/metrics.py @@ -176,6 +176,7 @@ def counter(name: str) -> str: counter("pageserver_tenant_throttling_wait_usecs_sum"), counter("pageserver_tenant_throttling_count"), counter("pageserver_timeline_wal_records_received"), + counter("pageserver_page_service_pagestream_flush_in_progress_micros"), *histogram("pageserver_page_service_batch_size"), *PAGESERVER_PER_TENANT_REMOTE_TIMELINE_CLIENT_METRICS, # "pageserver_directory_entries_count", -- only used if above a certain threshold From ec790870d54aadd1ecc6e431c9049b489ba33cd1 Mon Sep 17 00:00:00 2001 From: John Spray Date: Sat, 7 Dec 2024 13:05:09 +0000 Subject: [PATCH 64/65] storcon: automatically clear Pause/Stop scheduling policies to enable detaches (#10011) ## Problem We saw a tenant get stuck when it had been put into Pause scheduling mode to pin it to a pageserver, then it was left idle for a while and the control plane tried to detach it. Close: https://github.com/neondatabase/neon/issues/9957 ## Summary of changes - When changing policy to Detached or Secondary, set the scheduling policy to Active. 
- Add a test that exercises this
- When persisting tenant shards, set their `generation_pageserver` to null if the
  placement policy is not Attached (this enables consistency checks to work, and
  avoids leaving state in the DB that could be confusing/misleading in future)
---
 libs/pageserver_api/src/controller_api.rs     | 11 ++++
 storage_controller/src/persistence.rs         |  9 ++++
 storage_controller/src/service.rs             | 39 +++++++++++++-
 .../regress/test_storage_controller.py        | 52 +++++++++++++++++++
 4 files changed, 109 insertions(+), 2 deletions(-)

diff --git a/libs/pageserver_api/src/controller_api.rs b/libs/pageserver_api/src/controller_api.rs
index 9a5ebc95bdd3..6839ef69f592 100644
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -245,6 +245,17 @@ impl From<NodeAvailability> for NodeAvailabilityWrapper {
     }
 }
 
+/// Scheduling policy enables us to selectively disable some automatic actions that the
+/// controller performs on a tenant shard. This is only set to a non-default value by
+/// human intervention, and it is reset to the default value (Active) when the tenant's
+/// placement policy is modified away from Attached.
+///
+/// The typical use of a non-Active scheduling policy is one of:
+/// - Pinning a shard to a node (i.e. migrating it there & setting a non-Active scheduling policy)
+/// - Working around a bug (e.g. if something is flapping and we need to stop it until the bug is fixed)
+///
+/// If you're not sure which policy to use to pin a shard to its current location, you probably
+/// want Pause.
 #[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
 pub enum ShardSchedulingPolicy {
     // Normal mode: the tenant's scheduled locations may be updated at will, including
diff --git a/storage_controller/src/persistence.rs b/storage_controller/src/persistence.rs
index 14cc51240d10..7ca80c7dfeec 100644
--- a/storage_controller/src/persistence.rs
+++ b/storage_controller/src/persistence.rs
@@ -636,6 +636,13 @@ impl Persistence {
                 .into_boxed(),
         };
 
+        // Clear generation_pageserver if we are moving into a state where we won't have
+        // any attached pageservers.
+        let input_generation_pageserver = match input_placement_policy {
+            None | Some(PlacementPolicy::Attached(_)) => None,
+            Some(PlacementPolicy::Detached | PlacementPolicy::Secondary) => Some(None),
+        };
+
         #[derive(AsChangeset)]
         #[diesel(table_name = crate::schema::tenant_shards)]
         struct ShardUpdate {
@@ -643,6 +650,7 @@
             placement_policy: Option<String>,
             config: Option<String>,
             scheduling_policy: Option<String>,
+            generation_pageserver: Option<Option<i64>>,
         }
 
         let update = ShardUpdate {
@@ -655,6 +663,7 @@
                 .map(|c| serde_json::to_string(&c).unwrap()),
             scheduling_policy: input_scheduling_policy
                 .map(|p| serde_json::to_string(&p).unwrap()),
+            generation_pageserver: input_generation_pageserver,
         };
 
         query.set(update).execute(conn)?;
diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs
index 083c78233a8a..7e4ee53b4cbf 100644
--- a/storage_controller/src/service.rs
+++ b/storage_controller/src/service.rs
@@ -513,6 +513,9 @@ struct ShardUpdate {
 
     /// If this is None, generation is not updated.
     generation: Option<Generation>,
+
+    /// If this is None, scheduling policy is not updated.
+ scheduling_policy: Option, } enum StopReconciliationsReason { @@ -2376,6 +2379,23 @@ impl Service { } }; + // Ordinarily we do not update scheduling policy, but when making major changes + // like detaching or demoting to secondary-only, we need to force the scheduling + // mode to Active, or the caller's expected outcome (detach it) will not happen. + let scheduling_policy = match req.config.mode { + LocationConfigMode::Detached | LocationConfigMode::Secondary => { + // Special case: when making major changes like detaching or demoting to secondary-only, + // we need to force the scheduling mode to Active, or nothing will happen. + Some(ShardSchedulingPolicy::Active) + } + LocationConfigMode::AttachedMulti + | LocationConfigMode::AttachedSingle + | LocationConfigMode::AttachedStale => { + // While attached, continue to respect whatever the existing scheduling mode is. + None + } + }; + let mut create = true; for (shard_id, shard) in tenants.range_mut(TenantShardId::tenant_range(tenant_id)) { // Saw an existing shard: this is not a creation @@ -2401,6 +2421,7 @@ impl Service { placement_policy: placement_policy.clone(), tenant_config: req.config.tenant_conf.clone(), generation: set_generation, + scheduling_policy, }); } @@ -2497,6 +2518,7 @@ impl Service { placement_policy, tenant_config, generation, + scheduling_policy, } in &updates { self.persistence @@ -2505,7 +2527,7 @@ impl Service { Some(placement_policy.clone()), Some(tenant_config.clone()), *generation, - None, + *scheduling_policy, ) .await?; } @@ -2521,6 +2543,7 @@ impl Service { placement_policy, tenant_config, generation: update_generation, + scheduling_policy, } in updates { let Some(shard) = tenants.get_mut(&tenant_shard_id) else { @@ -2539,6 +2562,10 @@ impl Service { shard.generation = Some(generation); } + if let Some(scheduling_policy) = scheduling_policy { + shard.set_scheduling_policy(scheduling_policy); + } + shard.schedule(scheduler, &mut schedule_context)?; let maybe_waiter = self.maybe_reconcile_shard(shard, nodes); @@ -2992,9 +3019,17 @@ impl Service { let TenantPolicyRequest { placement, - scheduling, + mut scheduling, } = req; + if let Some(PlacementPolicy::Detached | PlacementPolicy::Secondary) = placement { + // When someone configures a tenant to detach, we force the scheduling policy to enable + // this to take effect. 
+ if scheduling.is_none() { + scheduling = Some(ShardSchedulingPolicy::Active); + } + } + self.persistence .update_tenant_shard( TenantFilter::Tenant(tenant_id), diff --git a/test_runner/regress/test_storage_controller.py b/test_runner/regress/test_storage_controller.py index f878116d533d..9f74dcccb99e 100644 --- a/test_runner/regress/test_storage_controller.py +++ b/test_runner/regress/test_storage_controller.py @@ -3230,3 +3230,55 @@ def has_hit_migration_failpoint(): # Always disable 'pause' failpoints, even on failure, to avoid hanging in shutdown env.storage_controller.configure_failpoints((migration_failpoint.value, "off")) raise + + +@run_only_on_default_postgres("Postgres version makes no difference here") +def test_storage_controller_detached_stopped( + neon_env_builder: NeonEnvBuilder, +): + """ + Test that detaching a tenant while it has scheduling policy set to Paused or Stop works + """ + + remote_storage_kind = s3_storage() + neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind) + + neon_env_builder.num_pageservers = 1 + + env = neon_env_builder.init_configs() + env.start() + virtual_ps_http = PageserverHttpClient(env.storage_controller_port, lambda: True) + + tenant_id = TenantId.generate() + env.storage_controller.tenant_create( + tenant_id, + shard_count=1, + ) + + assert len(env.pageserver.http_client().tenant_list_locations()["tenant_shards"]) == 1 + + # Disable scheduling: ordinarily this would prevent the tenant's configuration being + # reconciled to pageservers, but this should be overridden when detaching. + env.storage_controller.allowed_errors.append(".*Scheduling is disabled by policy.*") + env.storage_controller.tenant_policy_update( + tenant_id, + {"scheduling": "Stop"}, + ) + + env.storage_controller.consistency_check() + + # Detach the tenant + virtual_ps_http.tenant_location_conf( + tenant_id, + { + "mode": "Detached", + "secondary_conf": None, + "tenant_conf": {}, + "generation": None, + }, + ) + + env.storage_controller.consistency_check() + + # Confirm the detach happened + assert env.pageserver.http_client().tenant_list_locations()["tenant_shards"] == [] From 6c349e76d9aa223eb31bf925609f1a1adaabe6dc Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 9 Dec 2024 06:05:40 +0000 Subject: [PATCH 65/65] Storage release 2024-12-09
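
The regression test above drives the new behaviour end to end. As a quick reference, here is a condensed sketch (not part of the patch) of the operator flow this enables, written against the same test_runner fixtures used by `test_storage_controller_detached_stopped`; the Pause policy value comes from the `ShardSchedulingPolicy` doc comment above, and the helper name `pin_then_detach` is purely illustrative:

```python
# Sketch only: assumes the neon test_runner fixtures (storage controller client and
# PageserverHttpClient) that the regression test above already uses.
def pin_then_detach(env, virtual_ps_http, tenant_id):
    # Pin: a non-Active scheduling policy stops the controller from rescheduling
    # the shard away from its current pageserver.
    env.storage_controller.tenant_policy_update(tenant_id, {"scheduling": "Pause"})

    # Detach later: with this change the controller resets the scheduling policy to
    # Active as part of the detach, so the pin does not have to be undone by hand.
    virtual_ps_http.tenant_location_conf(
        tenant_id,
        {
            "mode": "Detached",
            "secondary_conf": None,
            "tenant_conf": {},
            "generation": None,
        },
    )
```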
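
The flush-in-progress counter added earlier in this series relies on observing a still-running flush periodically rather than waiting for it to finish. A minimal, self-contained Python sketch of that idea follows (a plain dict stands in for the Prometheus counter, the 10-second tick mirrors the Rust code, and all names are illustrative rather than part of the pageserver API):

```python
import asyncio
import time


async def measure_flush(flush_coro, counter, tick_seconds=10.0):
    # Await flush_coro, adding elapsed wall-clock time to counter["micros"] every
    # tick_seconds while the flush is still in progress (and once at the end), so a
    # stuck flush shows up in metrics long before it completes.
    task = asyncio.ensure_future(flush_coro)
    base = time.monotonic()
    while True:
        done, _pending = await asyncio.wait({task}, timeout=tick_seconds)
        now = time.monotonic()
        counter["micros"] += int((now - base) * 1_000_000)
        base = now
        if done:
            return task.result()


async def main():
    counter = {"micros": 0}
    # Stand-in for the real flush future: any awaitable whose duration we want tracked.
    await measure_flush(asyncio.sleep(0.05), counter)
    print(counter)


asyncio.run(main())
```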