Skip to content

Commit

Permalink
pageserver: add metrics for unknown ClearVmBits pages (#9911)
Browse files Browse the repository at this point in the history
## Problem

When ingesting implicit `ClearVmBits` operations, we silently drop the
writes if the relation or page is unknown. There are implicit
assumptions around VM pages wrt. explicit/implicit updates, sharding,
and relation sizes, which can possibly drop writes incorrectly. Adding a
few metrics will allow us to investigate further and tighten up the
logic.

Touches #9855.

## Summary of changes

Add a `pageserver_wal_ingest_clear_vm_bits_unknown` metric to record
dropped `ClearVmBits` writes.

Also add comments clarifying the behavior of relation sizes on non-zero
shards.
  • Loading branch information
erikgrinaker committed Nov 29, 2024
1 parent 84705f7 commit ef47535
Show file tree
Hide file tree
Showing 3 changed files with 57 additions and 16 deletions.
7 changes: 7 additions & 0 deletions pageserver/src/metrics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2135,6 +2135,7 @@ pub(crate) struct WalIngestMetrics {
pub(crate) records_committed: IntCounter,
pub(crate) records_filtered: IntCounter,
pub(crate) gap_blocks_zeroed_on_rel_extend: IntCounter,
pub(crate) clear_vm_bits_unknown: IntCounterVec,
}

pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
Expand Down Expand Up @@ -2163,6 +2164,12 @@ pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMet
"Total number of zero gap blocks written on relation extends"
)
.expect("failed to define a metric"),
clear_vm_bits_unknown: register_int_counter_vec!(
"pageserver_wal_ingest_clear_vm_bits_unknown",
"Number of ignored ClearVmBits operations due to unknown pages/relations",
&["entity"],
)
.expect("failed to define a metric"),
});

pub(crate) static WAL_REDO_TIME: Lazy<Histogram> = Lazy::new(|| {
Expand Down
17 changes: 14 additions & 3 deletions pageserver/src/pgdatadir_mapping.rs
Original file line number Diff line number Diff line change
Expand Up @@ -392,7 +392,9 @@ impl Timeline {
result
}

// Get size of a database in blocks
/// Get size of a database in blocks. This is only accurate on shard 0. It will undercount on
/// other shards, by only accounting for relations the shard has pages for, and only accounting
/// for pages up to the highest page number it has stored.
pub(crate) async fn get_db_size(
&self,
spcnode: Oid,
Expand All @@ -411,7 +413,10 @@ impl Timeline {
Ok(total_blocks)
}

/// Get size of a relation file
/// Get size of a relation file. The relation must exist, otherwise an error is returned.
///
/// This is only accurate on shard 0. On other shards, it will return the size up to the highest
/// page number stored in the shard.
pub(crate) async fn get_rel_size(
&self,
tag: RelTag,
Expand Down Expand Up @@ -447,7 +452,10 @@ impl Timeline {
Ok(nblocks)
}

/// Does relation exist?
/// Does the relation exist?
///
/// Only shard 0 has a full view of the relations. Other shards only know about relations that
/// the shard stores pages for.
pub(crate) async fn get_rel_exists(
&self,
tag: RelTag,
Expand Down Expand Up @@ -481,6 +489,9 @@ impl Timeline {

/// Get a list of all existing relations in given tablespace and database.
///
/// Only shard 0 has a full view of the relations. Other shards only know about relations that
/// the shard stores pages for.
///
/// # Cancel-Safety
///
/// This method is cancellation-safe.
Expand Down
49 changes: 36 additions & 13 deletions pageserver/src/walingest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -334,14 +334,32 @@ impl WalIngest {
// replaying it would fail to find the previous image of the page, because
// it doesn't exist. So check if the VM page(s) exist, and skip the WAL
// record if it doesn't.
let vm_size = get_relsize(modification, vm_rel, ctx).await?;
//
// TODO: analyze the metrics and tighten this up accordingly. This logic
// implicitly assumes that VM pages see explicit WAL writes before
// implicit ClearVmBits, and will otherwise silently drop updates.
let Some(vm_size) = get_relsize(modification, vm_rel, ctx).await? else {
WAL_INGEST
.clear_vm_bits_unknown
.with_label_values(&["relation"])
.inc();
return Ok(());
};
if let Some(blknum) = new_vm_blk {
if blknum >= vm_size {
WAL_INGEST
.clear_vm_bits_unknown
.with_label_values(&["new_page"])
.inc();
new_vm_blk = None;
}
}
if let Some(blknum) = old_vm_blk {
if blknum >= vm_size {
WAL_INGEST
.clear_vm_bits_unknown
.with_label_values(&["old_page"])
.inc();
old_vm_blk = None;
}
}
Expand Down Expand Up @@ -572,7 +590,8 @@ impl WalIngest {
modification.put_rel_page_image_zero(rel, fsm_physical_page_no)?;
fsm_physical_page_no += 1;
}
let nblocks = get_relsize(modification, rel, ctx).await?;
// TODO: re-examine the None case here wrt. sharding; should we error?
let nblocks = get_relsize(modification, rel, ctx).await?.unwrap_or(0);
if nblocks > fsm_physical_page_no {
// check if something to do: FSM is larger than truncate position
self.put_rel_truncation(modification, rel, fsm_physical_page_no, ctx)
Expand Down Expand Up @@ -612,7 +631,8 @@ impl WalIngest {
)?;
vm_page_no += 1;
}
let nblocks = get_relsize(modification, rel, ctx).await?;
// TODO: re-examine the None case here wrt. sharding; should we error?
let nblocks = get_relsize(modification, rel, ctx).await?.unwrap_or(0);
if nblocks > vm_page_no {
// check if something to do: VM is larger than truncate position
self.put_rel_truncation(modification, rel, vm_page_no, ctx)
Expand Down Expand Up @@ -1430,24 +1450,27 @@ impl WalIngest {
}
}

/// Returns the size of the relation as of this modification, or None if the relation doesn't exist.
///
/// This is only accurate on shard 0. On other shards, it will return the size up to the highest
/// page number stored in the shard, or None if the shard does not have any pages for it.
async fn get_relsize(
modification: &DatadirModification<'_>,
rel: RelTag,
ctx: &RequestContext,
) -> Result<BlockNumber, PageReconstructError> {
let nblocks = if !modification
) -> Result<Option<BlockNumber>, PageReconstructError> {
if !modification
.tline
.get_rel_exists(rel, Version::Modified(modification), ctx)
.await?
{
0
} else {
modification
.tline
.get_rel_size(rel, Version::Modified(modification), ctx)
.await?
};
Ok(nblocks)
return Ok(None);
}
modification
.tline
.get_rel_size(rel, Version::Modified(modification), ctx)
.await
.map(Some)
}

#[allow(clippy::bool_assert_comparison)]
Expand Down

0 comments on commit ef47535

Please sign in to comment.