pageserver: only apply ClearVmBits on relevant shards (#9895)
# Problem

VM (visibility map) pages are stored and managed like any other relation page, in the VM fork of the main relation, and they are sharded like other pages. Regular WAL writes to VM pages (typically performed by vacuum) are routed to the correct shard as usual. However, VM pages are also updated via `ClearVmBits` metadata records emitted when main relation pages are updated. These metadata records were sent to all shards, like other metadata records. This had the following effects:

* On shards responsible for VM pages, the `ClearVmBits` records apply as expected.

* On shard 0, which knows about the VM relation and its size but doesn't necessarily hold any VM pages, `ClearVmBits` writes may have been applied without the corresponding explicit WAL writes to the VM pages ever being applied.

* If VM pages are spread across multiple shards (unlikely with a 256 MB stripe size; see the sketch below), all shards may have applied `ClearVmBits` for pages that fall within their local view of the relation size, even for pages they do not own.

* On other shards, this caused a relation size cache miss and a DbDir and RelDir lookup before the `ClearVmBits` was dropped. With many relations, this could cause significant CPU overhead.

This is not believed to be a correctness problem, but this will be
verified in #9914.
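
For intuition, here is a minimal, self-contained sketch (not part of the patch) of how a heap block maps to a block of the VM fork. It assumes 8 KB pages and 2 VM bits per heap block, and it ignores the VM page header; the authoritative mapping is `HEAPBLK_TO_MAPBLOCK` in `postgres_ffi::pg_constants`.

```rust
// Assumed constants for illustration only; the real visibility map also
// reserves space for the page header, so the per-page count is slightly lower.
const BLCKSZ: u32 = 8192; // 8 KB pages
const VM_BITS_PER_HEAP_BLOCK: u32 = 2; // all-visible + all-frozen
const HEAPBLOCKS_PER_VM_PAGE: u32 = BLCKSZ * 8 / VM_BITS_PER_HEAP_BLOCK; // 32768

/// Which block of the VM fork holds the visibility bits for a given heap block.
fn heapblk_to_mapblock(heap_blkno: u32) -> u32 {
    heap_blkno / HEAPBLOCKS_PER_VM_PAGE
}

fn main() {
    // One VM page covers ~32k heap blocks, i.e. ~256 MB of heap. With 256 MB
    // stripes (32768 blocks of 8 KB), the VM fork itself only spans multiple
    // stripes once the main relation exceeds roughly 32768 * 32768 blocks ≈ 8 TB,
    // which is why VM pages rarely spread across shards in practice.
    assert_eq!(heapblk_to_mapblock(0), 0);
    assert_eq!(heapblk_to_mapblock(32_767), 0);
    assert_eq!(heapblk_to_mapblock(32_768), 1);
}
```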

Resolves #9855.

# Changes

Route `ClearVmBits` metadata records only to the shards responsible for
the VM pages.

Verification of the current VM handling, and cleanup of incomplete VM pages on shard 0 (and potentially elsewhere), are left as follow-up work.
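
For reference, the routing rule implemented by the diff below can be summarized by this standalone sketch. It is not the actual patch: the hypothetical `ShardView::owns_vm_block` and the assumed constant stand in for the real `ShardIdentity::is_key_local` / `rel_block_to_key` check and `HEAPBLK_TO_MAPBLOCK` from `pageserver_api` / `postgres_ffi`.

```rust
/// Hypothetical stand-in for ShardIdentity: decides whether this shard owns a
/// given block of the VM fork. The real check in the patch is
/// `shard.is_key_local(&rel_block_to_key(vm_rel, vm_blk))`.
struct ShardView {
    shard_number: u32,
    shard_count: u32,
    stripe_size_blocks: u32,
}

impl ShardView {
    fn owns_vm_block(&self, vm_blk: u32) -> bool {
        // Simplified striping: stripes are assigned to shards round-robin.
        (vm_blk / self.stripe_size_blocks) % self.shard_count == self.shard_number
    }
}

/// Simplified ClearVmBits payload: the heap blocks whose VM bits must be cleared.
struct ClearVmBits {
    old_heap_blkno: Option<u32>,
    new_heap_blkno: Option<u32>,
}

// Assumed: heap blocks covered by one VM page (see the sketch above).
const HEAPBLOCKS_PER_VM_PAGE: u32 = 32_768;

/// Keep only the heap block references whose VM page lives on this shard, and
/// drop the record entirely (return None) if neither reference survives.
fn filter_clear_vm_bits(shard: &ShardView, mut rec: ClearVmBits) -> Option<ClearVmBits> {
    let is_local = |heap_blkno: u32| shard.owns_vm_block(heap_blkno / HEAPBLOCKS_PER_VM_PAGE);
    rec.old_heap_blkno = rec.old_heap_blkno.filter(|&b| is_local(b));
    rec.new_heap_blkno = rec.new_heap_blkno.filter(|&b| is_local(b));
    (rec.old_heap_blkno.is_some() || rec.new_heap_blkno.is_some()).then_some(rec)
}

fn main() {
    // Shard 1 of 2 with 256 MB stripes (32768 blocks of 8 KB): a record that only
    // touches heap blocks whose VM bits live on shard 0 is dropped entirely here.
    let shard = ShardView { shard_number: 1, shard_count: 2, stripe_size_blocks: 32_768 };
    let rec = ClearVmBits { old_heap_blkno: Some(100), new_heap_blkno: None };
    assert!(filter_clear_vm_bits(&shard, rec).is_none());
}
```
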
erikgrinaker authored Nov 27, 2024
1 parent 9e3cb75 commit da1daa2
Showing 1 changed file with 54 additions and 17 deletions.
libs/wal_decoder/src/decoder.rs (54 additions, 17 deletions)
@@ -4,6 +4,7 @@
use crate::models::*;
use crate::serialized_batch::SerializedValueBatch;
use bytes::{Buf, Bytes};
+ use pageserver_api::key::rel_block_to_key;
use pageserver_api::reltag::{RelTag, SlruKind};
use pageserver_api::shard::ShardIdentity;
use postgres_ffi::pg_constants;
@@ -32,7 +33,8 @@ impl InterpretedWalRecord {
FlushUncommittedRecords::No
};

- let metadata_record = MetadataRecord::from_decoded(&decoded, next_record_lsn, pg_version)?;
+ let metadata_record =
+     MetadataRecord::from_decoded_filtered(&decoded, shard, next_record_lsn, pg_version)?;
let batch = SerializedValueBatch::from_decoded_filtered(
decoded,
shard,
@@ -51,8 +53,13 @@ impl InterpretedWalRecord {
}

impl MetadataRecord {
- fn from_decoded(
+ /// Builds a metadata record for this WAL record, if any.
+ ///
+ /// Only metadata records relevant for the given shard are emitted. Currently, most metadata
+ /// records are broadcast to all shards for simplicity, but this should be improved.
+ fn from_decoded_filtered(
decoded: &DecodedWALRecord,
+ shard: &ShardIdentity,
next_record_lsn: Lsn,
pg_version: u32,
) -> anyhow::Result<Option<MetadataRecord>> {
@@ -61,26 +68,27 @@ impl MetadataRecord {
let mut buf = decoded.record.clone();
buf.advance(decoded.main_data_offset);

- match decoded.xl_rmid {
+ // First, generate metadata records from the decoded WAL record.
+ let mut metadata_record = match decoded.xl_rmid {
pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => {
- Self::decode_heapam_record(&mut buf, decoded, pg_version)
+ Self::decode_heapam_record(&mut buf, decoded, pg_version)?
}
- pg_constants::RM_NEON_ID => Self::decode_neonmgr_record(&mut buf, decoded, pg_version),
+ pg_constants::RM_NEON_ID => Self::decode_neonmgr_record(&mut buf, decoded, pg_version)?,
// Handle other special record types
- pg_constants::RM_SMGR_ID => Self::decode_smgr_record(&mut buf, decoded),
- pg_constants::RM_DBASE_ID => Self::decode_dbase_record(&mut buf, decoded, pg_version),
+ pg_constants::RM_SMGR_ID => Self::decode_smgr_record(&mut buf, decoded)?,
+ pg_constants::RM_DBASE_ID => Self::decode_dbase_record(&mut buf, decoded, pg_version)?,
pg_constants::RM_TBLSPC_ID => {
tracing::trace!("XLOG_TBLSPC_CREATE/DROP is not handled yet");
- Ok(None)
+ None
}
- pg_constants::RM_CLOG_ID => Self::decode_clog_record(&mut buf, decoded, pg_version),
+ pg_constants::RM_CLOG_ID => Self::decode_clog_record(&mut buf, decoded, pg_version)?,
pg_constants::RM_XACT_ID => {
- Self::decode_xact_record(&mut buf, decoded, next_record_lsn)
+ Self::decode_xact_record(&mut buf, decoded, next_record_lsn)?
}
pg_constants::RM_MULTIXACT_ID => {
- Self::decode_multixact_record(&mut buf, decoded, pg_version)
+ Self::decode_multixact_record(&mut buf, decoded, pg_version)?
}
- pg_constants::RM_RELMAP_ID => Self::decode_relmap_record(&mut buf, decoded),
+ pg_constants::RM_RELMAP_ID => Self::decode_relmap_record(&mut buf, decoded)?,
// This is an odd duck. It needs to go to all shards.
// Since it uses the checkpoint image (that's initialized from CHECKPOINT_KEY
// in WalIngest::new), we have to send the whole DecodedWalRecord::record to
@@ -89,19 +97,48 @@ impl MetadataRecord {
// Alternatively, one can make the checkpoint part of the subscription protocol
// to the pageserver. This should work fine, but can be done at a later point.
pg_constants::RM_XLOG_ID => {
- Self::decode_xlog_record(&mut buf, decoded, next_record_lsn)
+ Self::decode_xlog_record(&mut buf, decoded, next_record_lsn)?
}
pg_constants::RM_LOGICALMSG_ID => {
- Self::decode_logical_message_record(&mut buf, decoded)
+ Self::decode_logical_message_record(&mut buf, decoded)?
}
- pg_constants::RM_STANDBY_ID => Self::decode_standby_record(&mut buf, decoded),
- pg_constants::RM_REPLORIGIN_ID => Self::decode_replorigin_record(&mut buf, decoded),
+ pg_constants::RM_STANDBY_ID => Self::decode_standby_record(&mut buf, decoded)?,
+ pg_constants::RM_REPLORIGIN_ID => Self::decode_replorigin_record(&mut buf, decoded)?,
_unexpected => {
// TODO: consider failing here instead of blindly doing something without
// understanding the protocol
- Ok(None)
+ None
}
- }
+ };

+ // Next, filter the metadata record by shard.
+
+ // Route VM page updates to the shards that own them. VM pages are stored in the VM fork
+ // of the main relation. These are sharded and managed just like regular relation pages.
+ // See: https://github.com/neondatabase/neon/issues/9855
+ if let Some(
+     MetadataRecord::Heapam(HeapamRecord::ClearVmBits(ref mut clear_vm_bits))
+     | MetadataRecord::Neonrmgr(NeonrmgrRecord::ClearVmBits(ref mut clear_vm_bits)),
+ ) = metadata_record
+ {
+     let is_local_vm_page = |heap_blk| {
+         let vm_blk = pg_constants::HEAPBLK_TO_MAPBLOCK(heap_blk);
+         shard.is_key_local(&rel_block_to_key(clear_vm_bits.vm_rel, vm_blk))
+     };
+     // Send the old and new VM page updates to their respective shards.
+     clear_vm_bits.old_heap_blkno = clear_vm_bits
+         .old_heap_blkno
+         .filter(|&blkno| is_local_vm_page(blkno));
+     clear_vm_bits.new_heap_blkno = clear_vm_bits
+         .new_heap_blkno
+         .filter(|&blkno| is_local_vm_page(blkno));
+     // If neither VM page belongs to this shard, discard the record.
+     if clear_vm_bits.old_heap_blkno.is_none() && clear_vm_bits.new_heap_blkno.is_none() {
+         metadata_record = None
+     }
+ }
+
+ Ok(metadata_record)
}

fn decode_heapam_record(

1 comment on commit da1daa2

@github-actions

7053 tests run: 6719 passed, 1 failed, 333 skipped (full report)


Failures on Postgres 16

  • test_sharded_ingest[github-actions-selfhosted-vanilla-1]: release-x86-64
# Run all failed tests locally:
scripts/pytest -vv -n $(nproc) -k "test_sharded_ingest[release-pg16-github-actions-selfhosted-vanilla-1]"
Flaky tests (3)

Postgres 17

Postgres 14

Code coverage* (full report)

  • functions: 30.6% (7982 of 26064 functions)
  • lines: 48.6% (63390 of 130514 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
da1daa2 at 2024-11-27T21:28:33.661Z :recycle:
