Skip to content

Commit

Permalink
pageserver: respect no_sync in VirtualFile (#9772)
Browse files Browse the repository at this point in the history
## Problem

`no_sync` initially just skipped syncfs on startup (#9677). I'm also
interested in flaky tests that time out during pageserver shutdown while
flushing l0s, so to eliminate disk throughput as a source of issues
there,

## Summary of changes

- Drive-by change for test timeouts: add a couple more ::info logs
during pageserver startup so it's obvious which part got stuck.
- Add a SyncMode enum to configure VirtualFile and respect it in
sync_all and sync_data functions
- During pageserver startup, set SyncMode according to `no_sync`
  • Loading branch information
jcsp authored Nov 18, 2024
1 parent b6154b0 commit 261d065
Show file tree
Hide file tree
Showing 6 changed files with 45 additions and 1 deletion.
1 change: 1 addition & 0 deletions pageserver/benches/bench_ingest.rs
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ fn criterion_benchmark(c: &mut Criterion) {
16384,
virtual_file::io_engine_for_bench(),
conf.virtual_file_io_mode,
virtual_file::SyncMode::Sync,
);
page_cache::init(conf.page_cache_size);

Expand Down
1 change: 1 addition & 0 deletions pageserver/ctl/src/layer_map_analyzer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> {
10,
virtual_file::api::IoEngineKind::StdFs,
IoMode::preferred(),
virtual_file::SyncMode::Sync,
);
pageserver::page_cache::init(100);

Expand Down
3 changes: 3 additions & 0 deletions pageserver/ctl/src/layers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@ async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result
10,
virtual_file::api::IoEngineKind::StdFs,
IoMode::preferred(),
virtual_file::SyncMode::Sync,
);
page_cache::init(100);
let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
Expand All @@ -65,6 +66,7 @@ async fn read_image_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result
10,
virtual_file::api::IoEngineKind::StdFs,
IoMode::preferred(),
virtual_file::SyncMode::Sync,
);
page_cache::init(100);
let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path");
Expand Down Expand Up @@ -171,6 +173,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
10,
virtual_file::api::IoEngineKind::StdFs,
IoMode::preferred(),
virtual_file::SyncMode::Sync,
);
pageserver::page_cache::init(100);

Expand Down
1 change: 1 addition & 0 deletions pageserver/ctl/src/main.rs
Original file line number Diff line number Diff line change
Expand Up @@ -209,6 +209,7 @@ async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> {
10,
virtual_file::api::IoEngineKind::StdFs,
IoMode::preferred(),
virtual_file::SyncMode::Sync,
);
page_cache::init(100);
let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
Expand Down
7 changes: 7 additions & 0 deletions pageserver/src/bin/pageserver.rs
Original file line number Diff line number Diff line change
Expand Up @@ -171,11 +171,18 @@ fn main() -> anyhow::Result<()> {
let scenario = failpoint_support::init();

// Basic initialization of things that don't change after startup
tracing::info!("Initializing virtual_file...");
virtual_file::init(
conf.max_file_descriptors,
conf.virtual_file_io_engine,
conf.virtual_file_io_mode,
if conf.no_sync {
virtual_file::SyncMode::UnsafeNoSync
} else {
virtual_file::SyncMode::Sync
},
);
tracing::info!("Initializing page_cache...");
page_cache::init(conf.page_cache_size);

start_pageserver(launch_ts, conf).context("Failed to start pageserver")?;
Expand Down
33 changes: 32 additions & 1 deletion pageserver/src/virtual_file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -175,10 +175,16 @@ impl VirtualFile {
}

pub async fn sync_all(&self) -> Result<(), Error> {
if SYNC_MODE.load(std::sync::atomic::Ordering::Relaxed) == SyncMode::UnsafeNoSync as u8 {
return Ok(());
}
self.inner.sync_all().await
}

pub async fn sync_data(&self) -> Result<(), Error> {
if SYNC_MODE.load(std::sync::atomic::Ordering::Relaxed) == SyncMode::UnsafeNoSync as u8 {
return Ok(());
}
self.inner.sync_data().await
}

Expand Down Expand Up @@ -233,6 +239,27 @@ impl VirtualFile {
}
}

/// Indicates whether to enable fsync, fdatasync, or O_SYNC/O_DSYNC when writing
/// files. Switching this off is unsafe and only used for testing on machines
/// with slow drives.
#[repr(u8)]
pub enum SyncMode {
Sync,
UnsafeNoSync,
}

impl TryFrom<u8> for SyncMode {
type Error = u8;

fn try_from(value: u8) -> Result<Self, Self::Error> {
Ok(match value {
v if v == (SyncMode::Sync as u8) => SyncMode::Sync,
v if v == (SyncMode::UnsafeNoSync as u8) => SyncMode::UnsafeNoSync,
x => return Err(x),
})
}
}

///
/// A virtual file descriptor. You can use this just like std::fs::File, but internally
/// the underlying file is closed if the system is low on file descriptors,
Expand Down Expand Up @@ -1332,12 +1359,13 @@ impl OpenFiles {
/// server startup.
///
#[cfg(not(test))]
pub fn init(num_slots: usize, engine: IoEngineKind, mode: IoMode) {
pub fn init(num_slots: usize, engine: IoEngineKind, mode: IoMode, sync_mode: SyncMode) {
if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() {
panic!("virtual_file::init called twice");
}
set_io_mode(mode);
io_engine::init(engine);
SYNC_MODE.store(sync_mode as u8, std::sync::atomic::Ordering::Relaxed);
crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64);
}

Expand Down Expand Up @@ -1379,6 +1407,9 @@ pub(crate) fn set_io_mode(mode: IoMode) {
pub(crate) fn get_io_mode() -> IoMode {
IoMode::try_from(IO_MODE.load(Ordering::Relaxed)).unwrap()
}

static SYNC_MODE: AtomicU8 = AtomicU8::new(SyncMode::Sync as u8);

#[cfg(test)]
mod tests {
use crate::context::DownloadBehavior;
Expand Down

1 comment on commit 261d065

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

5581 tests run: 5327 passed, 1 failed, 253 skipped (full report)


Failures on Postgres 16

  • test_compaction_l0_memory[github-actions-selfhosted]: release-x86-64
# Run all failed tests locally:
scripts/pytest -vv -n $(nproc) -k "test_compaction_l0_memory[release-pg16-github-actions-selfhosted]"

Code coverage* (full report)

  • functions: 31.5% (7930 of 25166 functions)
  • lines: 49.7% (62850 of 126578 lines)

* collected from Rust tests only


The comment gets automatically updated with the latest test results
261d065 at 2024-11-18T10:36:21.958Z :recycle:

Please sign in to comment.