Skip to content

Commit

Permalink
pageserver: assert that keys belong to shard (#9943)
Browse files Browse the repository at this point in the history
We've seen cases where stray keys end up on the wrong shard. This
shouldn't happen. Add debug assertions to prevent this. In release
builds, we should be lenient in order to handle changing key ownership
policies.

Touches #9914.
  • Loading branch information
erikgrinaker authored Dec 6, 2024
1 parent 3f1c542 commit 7838659
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 8 deletions.
5 changes: 3 additions & 2 deletions libs/pageserver_api/src/shard.rs
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,8 @@ impl ShardIdentity {
key_to_shard_number(self.count, self.stripe_size, key)
}

/// Return true if the key should be ingested by this shard
/// Return true if the key is stored only on this shard. This does not include
/// global keys, see is_key_global().
///
/// Shards must ingest _at least_ keys which return true from this check.
pub fn is_key_local(&self, key: &Key) -> bool {
Expand All @@ -171,7 +172,7 @@ impl ShardIdentity {
}

/// Return true if the key should be stored on all shards, not just one.
fn is_key_global(&self, key: &Key) -> bool {
pub fn is_key_global(&self, key: &Key) -> bool {
if key.is_slru_block_key() || key.is_slru_segment_size_key() || key.is_aux_file_key() {
// Special keys that are only stored on shard 0
false
Expand Down
6 changes: 6 additions & 0 deletions libs/utils/src/shard.rs
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,12 @@ impl TenantShardId {
}
}

impl std::fmt::Display for ShardNumber {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
self.0.fmt(f)
}
}

impl std::fmt::Display for ShardSlug<'_> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
Expand Down
19 changes: 18 additions & 1 deletion pageserver/src/tenant/timeline.rs
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ use utils::{
postgres_client::PostgresClientProtocol,
sync::gate::{Gate, GateGuard},
};
use wal_decoder::serialized_batch::SerializedValueBatch;
use wal_decoder::serialized_batch::{SerializedValueBatch, ValueMeta};

use std::sync::atomic::Ordering as AtomicOrdering;
use std::sync::{Arc, Mutex, RwLock, Weak};
Expand Down Expand Up @@ -5924,6 +5924,23 @@ impl<'a> TimelineWriter<'a> {
return Ok(());
}

// In debug builds, assert that we don't write any keys that don't belong to this shard.
// We don't assert this in release builds, since key ownership policies may change over
// time. Stray keys will be removed during compaction.
if cfg!(debug_assertions) {
for metadata in &batch.metadata {
if let ValueMeta::Serialized(metadata) = metadata {
let key = Key::from_compact(metadata.key);
assert!(
self.shard_identity.is_key_local(&key)
|| self.shard_identity.is_key_global(&key),
"key {key} does not belong on shard {}",
self.shard_identity.shard_index()
);
}
}
}

let batch_max_lsn = batch.max_lsn;
let buf_size: u64 = batch.buffer_size() as u64;

Expand Down
16 changes: 11 additions & 5 deletions pageserver/src/tenant/timeline/compaction.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1179,11 +1179,12 @@ impl Timeline {
.await
.map_err(CompactionError::Other)?;
} else {
debug!(
"Dropping key {} during compaction (it belongs on shard {:?})",
key,
self.shard_identity.get_shard_number(&key)
);
let shard = self.shard_identity.shard_index();
let owner = self.shard_identity.get_shard_number(&key);
if cfg!(debug_assertions) {
panic!("key {key} does not belong on shard {shard}, owned by {owner}");
}
debug!("dropping key {key} during compaction (it belongs on shard {owner})");
}

if !new_layers.is_empty() {
Expand Down Expand Up @@ -2054,6 +2055,11 @@ impl Timeline {
// This is not handled in the filter iterator because shard is determined by hash.
// Therefore, it does not give us any performance benefit to do things like skip
// a whole layer file as handling key spaces (ranges).
if cfg!(debug_assertions) {
let shard = self.shard_identity.shard_index();
let owner = self.shard_identity.get_shard_number(&key);
panic!("key {key} does not belong on shard {shard}, owned by {owner}");
}
continue;
}
if !job_desc.compaction_key_range.contains(&key) {
Expand Down

1 comment on commit 7838659

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

7166 tests run: 6847 passed, 0 failed, 319 skipped (full report)


Flaky tests (4)

Postgres 17

Postgres 14

Code coverage* (full report)

  • functions: 31.4% (8327 of 26507 functions)
  • lines: 47.8% (65501 of 137095 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
7838659 at 2024-12-06T12:06:48.822Z :recycle:

Please sign in to comment.