Refactor timeline creation on safekeepers, allowing storing peer ids.

Have a separate routine and HTTP endpoint to create a timeline on safekeepers. It is
not used yet, i.e. the timeline is still created implicitly, but we'll change that
once the infrastructure for learning which timelines are assigned to which
safekeepers is ready, preventing accidental creation by compute.
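
For reference, a rough sketch of the request model the new endpoint consumes,
inferred from the control_plane changes below (the exact module layout and serde
attributes in walkeeper::http::models are assumptions; only the field names and
types come from this diff):

    use serde::{Deserialize, Serialize};
    use zenith_utils::zid::{ZNodeId, ZTenantId, ZTimelineId};

    // Body of POST {http_base_url}/timeline, mirroring the fields passed by
    // SafekeeperNode::timeline_create() in control_plane/src/safekeeper.rs.
    #[derive(Debug, Serialize, Deserialize)]
    pub struct TimelineCreateRequest {
        pub tenant_id: ZTenantId,
        pub timeline_id: ZTimelineId,
        pub peer_ids: Vec<ZNodeId>,
    }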

Change the safekeeper control file format so that it can store the set of
peers. Knowing the peers provides part of the foundation for peer
recovery (calculating min horizons such as truncate_lsn for WAL truncation and
commit_lsn as a sync-safekeepers replacement) and for proper membership changes;
as with the endpoint above, this is not used yet.
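
To illustrate the kind of computation knowing peers enables, here is a hedged
sketch; the PeerProgress record and its flush_lsn field are hypothetical, since
this commit only persists peer ids:

    use zenith_utils::lsn::Lsn;

    // Hypothetical per-peer progress; this commit stores only peer ids, so
    // tracking flush positions like this is future work.
    struct PeerProgress {
        flush_lsn: Lsn,
    }

    // WAL below the slowest peer's flush position may still be needed by that
    // peer, so it must not be truncated locally (the peer_horizon_lsn idea).
    fn peer_horizon(peers: &[PeerProgress]) -> Lsn {
        peers.iter().map(|p| p.flush_lsn).min().unwrap_or(Lsn(0))
    }

    // commit_lsn as a sync-safekeepers replacement: a position is committed
    // once a majority of peers has flushed it. Assumes a non-empty peer set.
    fn quorum_commit(peers: &[PeerProgress]) -> Lsn {
        let mut flushed: Vec<Lsn> = peers.iter().map(|p| p.flush_lsn).collect();
        flushed.sort_unstable();
        flushed.reverse();
        flushed[peers.len() / 2] // (majority)-th highest flush position
    }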

Using the control file version bump, extract tenant_id and timeline_id to the top
level, where they fit better. Also add several LSNs there and rename
truncate_lsn to the more specific peer_horizon_lsn.
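
Pieced together from the version 3 migration arm in control_file_upgrade.rs below,
the upgraded top-level state looks roughly like this (a sketch only: derives, serde
attributes, doc comments, and any fields not touched by the migration code are
omitted or assumed):

    use crate::safekeeper::{AcceptorState, Peers, PgUuid, ServerInfo};
    use zenith_utils::lsn::Lsn;
    use zenith_utils::zid::{ZTenantId, ZTimelineId};

    pub struct SafeKeeperState {
        pub tenant_id: ZTenantId,       // lifted out of ServerInfo
        pub timeline_id: ZTimelineId,   // lifted out of ServerInfo
        pub acceptor_state: AcceptorState,
        pub server: ServerInfo,
        pub proposer_uuid: PgUuid,
        pub commit_lsn: Lsn,            // WAL acknowledged by a quorum
        pub s3_wal_lsn: Lsn,            // new in this version
        pub peer_horizon_lsn: Lsn,      // renamed from truncate_lsn
        pub remote_consistent_lsn: Lsn, // new in this version
        pub peers: Peers,               // the newly stored peer set
    }
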
arssher committed Mar 6, 2022
1 parent 66eb2a1 commit f86cf93
Showing 14 changed files with 406 additions and 197 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

1 change: 1 addition & 0 deletions control_plane/Cargo.toml
@@ -17,5 +17,6 @@ url = "2.2.2"
reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }

pageserver = { path = "../pageserver" }
walkeeper = { path = "../walkeeper" }
zenith_utils = { path = "../zenith_utils" }
workspace_hack = { path = "../workspace_hack" }
24 changes: 23 additions & 1 deletion control_plane/src/safekeeper.rs
@@ -14,8 +14,9 @@ use postgres::Config;
use reqwest::blocking::{Client, RequestBuilder, Response};
use reqwest::{IntoUrl, Method};
use thiserror::Error;
use walkeeper::http::models::TimelineCreateRequest;
use zenith_utils::http::error::HttpErrorBody;
use zenith_utils::zid::ZNodeId;
use zenith_utils::zid::{ZNodeId, ZTenantId, ZTimelineId};

use crate::local_env::{LocalEnv, SafekeeperConf};
use crate::storage::PageServerNode;
@@ -261,4 +262,25 @@ impl SafekeeperNode {
.error_from_body()?;
Ok(())
}

pub fn timeline_create(
&self,
tenant_id: ZTenantId,
timeline_id: ZTimelineId,
peer_ids: Vec<ZNodeId>,
) -> Result<()> {
Ok(self
.http_request(
Method::POST,
format!("{}/{}", self.http_base_url, "timeline"),
)
.json(&TimelineCreateRequest {
tenant_id,
timeline_id,
peer_ids,
})
.send()?
.error_from_body()?
.json()?)
}
}
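
A hypothetical call site for the new helper; the setup of the SafekeeperNode, the
use of anyhow::Result, constructing ZNodeId from a raw integer, and the generate()
constructors on the id types (mirroring ZTenantTimelineId::generate() in the tests
further down) are all assumptions:

    use anyhow::Result;
    use zenith_utils::zid::{ZNodeId, ZTenantId, ZTimelineId};

    use crate::safekeeper::SafekeeperNode; // path assumed (control_plane crate)

    // `sk` is an already-configured SafekeeperNode from the local environment.
    fn register_timeline(sk: &SafekeeperNode) -> Result<()> {
        let tenant_id = ZTenantId::generate();
        let timeline_id = ZTimelineId::generate();
        // Tell this safekeeper about the timeline and its two peers up front,
        // instead of relying on implicit creation by compute.
        sk.timeline_create(tenant_id, timeline_id, vec![ZNodeId(1), ZNodeId(2)])?;
        Ok(())
    }
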
7 changes: 2 additions & 5 deletions walkeeper/src/bin/safekeeper.rs
@@ -11,7 +11,7 @@ use std::io::{ErrorKind, Write};
use std::path::{Path, PathBuf};
use std::thread;
use tracing::*;
use walkeeper::control_file::{self, CreateControlFile};
use walkeeper::control_file::{self};
use zenith_utils::http::endpoint;
use zenith_utils::zid::ZNodeId;
use zenith_utils::{logging, tcp_listener, GIT_VERSION};
@@ -108,10 +108,7 @@ fn main() -> Result<()> {
.get_matches();

if let Some(addr) = arg_matches.value_of("dump-control-file") {
let state = control_file::FileStorage::load_control_file(
Path::new(addr),
CreateControlFile::False,
)?;
let state = control_file::FileStorage::load_control_file(Path::new(addr))?;
let json = serde_json::to_string(&state)?;
print!("{}", json);
return Ok(());
110 changes: 47 additions & 63 deletions walkeeper/src/control_file.rs
@@ -27,13 +27,6 @@ const CONTROL_FILE_NAME: &str = "safekeeper.control";
const CONTROL_FILE_NAME_PARTIAL: &str = "safekeeper.control.partial";
pub const CHECKSUM_SIZE: usize = std::mem::size_of::<u32>();

// A named boolean.
#[derive(Debug)]
pub enum CreateControlFile {
True,
False,
}

lazy_static! {
static ref PERSIST_CONTROL_FILE_SECONDS: HistogramVec = register_histogram_vec!(
"safekeeper_persist_control_file_seconds",
@@ -94,28 +87,22 @@ impl FileStorage {
pub fn load_control_file_conf(
conf: &SafeKeeperConf,
zttid: &ZTenantTimelineId,
create: CreateControlFile,
) -> Result<SafeKeeperState> {
let path = conf.timeline_dir(zttid).join(CONTROL_FILE_NAME);
Self::load_control_file(path, create)
Self::load_control_file(path)
}

/// Read in the control file.
/// If create=false and file doesn't exist, bails out.
pub fn load_control_file<P: AsRef<Path>>(
control_file_path: P,
create: CreateControlFile,
) -> Result<SafeKeeperState> {
pub fn load_control_file<P: AsRef<Path>>(control_file_path: P) -> Result<SafeKeeperState> {
info!(
"loading control file {}, create={:?}",
"loading control file {}",
control_file_path.as_ref().display(),
create,
);

let mut control_file = OpenOptions::new()
.read(true)
.write(true)
.create(matches!(create, CreateControlFile::True))
.open(&control_file_path)
.with_context(|| {
format!(
Expand All @@ -124,41 +111,32 @@ impl FileStorage {
)
})?;

// Empty file is legit on 'create', don't try to deser from it.
let state = if control_file.metadata().unwrap().len() == 0 {
if let CreateControlFile::False = create {
bail!("control file is empty");
}
SafeKeeperState::new()
} else {
let mut buf = Vec::new();
control_file
.read_to_end(&mut buf)
.context("failed to read control file")?;

let calculated_checksum = crc32c::crc32c(&buf[..buf.len() - CHECKSUM_SIZE]);

let expected_checksum_bytes: &[u8; CHECKSUM_SIZE] =
buf[buf.len() - CHECKSUM_SIZE..].try_into()?;
let expected_checksum = u32::from_le_bytes(*expected_checksum_bytes);

ensure!(
calculated_checksum == expected_checksum,
let mut buf = Vec::new();
control_file
.read_to_end(&mut buf)
.context("failed to read control file")?;

let calculated_checksum = crc32c::crc32c(&buf[..buf.len() - CHECKSUM_SIZE]);

let expected_checksum_bytes: &[u8; CHECKSUM_SIZE] =
buf[buf.len() - CHECKSUM_SIZE..].try_into()?;
let expected_checksum = u32::from_le_bytes(*expected_checksum_bytes);

ensure!(
calculated_checksum == expected_checksum,
format!(
"safekeeper control file checksum mismatch: expected {} got {}",
expected_checksum, calculated_checksum
)
);

let state = FileStorage::deser_sk_state(&mut &buf[..buf.len() - CHECKSUM_SIZE])
.with_context(|| {
format!(
"safekeeper control file checksum mismatch: expected {} got {}",
expected_checksum, calculated_checksum
"while reading control file {}",
control_file_path.as_ref().display(),
)
);

FileStorage::deser_sk_state(&mut &buf[..buf.len() - CHECKSUM_SIZE]).with_context(
|| {
format!(
"while reading control file {}",
control_file_path.as_ref().display(),
)
},
)?
};
})?;
Ok(state)
}
}
@@ -247,51 +225,57 @@ mod test {
fn load_from_control_file(
conf: &SafeKeeperConf,
zttid: &ZTenantTimelineId,
create: CreateControlFile,
) -> Result<(FileStorage, SafeKeeperState)> {
fs::create_dir_all(&conf.timeline_dir(zttid)).expect("failed to create timeline dir");
Ok((
FileStorage::new(zttid, conf),
FileStorage::load_control_file_conf(conf, zttid, create)?,
FileStorage::load_control_file_conf(conf, zttid)?,
))
}

fn create(
conf: &SafeKeeperConf,
zttid: &ZTenantTimelineId,
) -> Result<(FileStorage, SafeKeeperState)> {
fs::create_dir_all(&conf.timeline_dir(zttid)).expect("failed to create timeline dir");
let state = SafeKeeperState::empty();
let mut storage = FileStorage::new(zttid, conf);
storage.persist(&state)?;
Ok((storage, state))
}

#[test]
fn test_read_write_safekeeper_state() {
let conf = stub_conf();
let zttid = ZTenantTimelineId::generate();
{
let (mut storage, mut state) =
load_from_control_file(&conf, &zttid, CreateControlFile::True)
.expect("failed to read state");
let (mut storage, mut state) = create(&conf, &zttid).expect("failed to create state");
// change something
state.wal_start_lsn = Lsn(42);
state.commit_lsn = Lsn(42);
storage.persist(&state).expect("failed to persist state");
}

let (_, state) = load_from_control_file(&conf, &zttid, CreateControlFile::False)
.expect("failed to read state");
assert_eq!(state.wal_start_lsn, Lsn(42));
let (_, state) = load_from_control_file(&conf, &zttid).expect("failed to read state");
assert_eq!(state.commit_lsn, Lsn(42));
}

#[test]
fn test_safekeeper_state_checksum_mismatch() {
let conf = stub_conf();
let zttid = ZTenantTimelineId::generate();
{
let (mut storage, mut state) =
load_from_control_file(&conf, &zttid, CreateControlFile::True)
.expect("failed to read state");
let (mut storage, mut state) = create(&conf, &zttid).expect("failed to read state");

// change something
state.wal_start_lsn = Lsn(42);
state.commit_lsn = Lsn(42);
storage.persist(&state).expect("failed to persist state");
}
let control_path = conf.timeline_dir(&zttid).join(CONTROL_FILE_NAME);
let mut data = fs::read(&control_path).unwrap();
data[0] += 1; // change the first byte of the file to fail checksum validation
fs::write(&control_path, &data).expect("failed to write control file");

match load_from_control_file(&conf, &zttid, CreateControlFile::False) {
match load_from_control_file(&conf, &zttid) {
Err(err) => assert!(err
.to_string()
.contains("safekeeper control file checksum mismatch")),
78 changes: 71 additions & 7 deletions walkeeper/src/control_file_upgrade.rs
@@ -1,6 +1,6 @@
//! Code to deal with safekeeper control file upgrades
use crate::safekeeper::{
AcceptorState, PgUuid, SafeKeeperState, ServerInfo, Term, TermHistory, TermSwitchEntry,
AcceptorState, Peers, PgUuid, SafeKeeperState, ServerInfo, Term, TermHistory, TermSwitchEntry,
};
use anyhow::{bail, Result};
use serde::{Deserialize, Serialize};
@@ -26,7 +26,7 @@ struct SafeKeeperStateV1 {
/// persistent acceptor state
acceptor_state: AcceptorStateV1,
/// information about server
server: ServerInfo,
server: ServerInfoV2,
/// Unique id of the last *elected* proposer we dealed with. Not needed
/// for correctness, exists for monitoring purposes.
proposer_uuid: PgUuid,
@@ -70,6 +70,39 @@ pub struct SafeKeeperStateV2 {
pub wal_start_lsn: Lsn,
}

#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ServerInfoV3 {
/// Postgres server version
pub pg_version: u32,
pub system_id: SystemId,
#[serde(with = "hex")]
pub tenant_id: ZTenantId,
/// Zenith timelineid
#[serde(with = "hex")]
pub timeline_id: ZTimelineId,
pub wal_seg_size: u32,
}

#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SafeKeeperStateV3 {
/// persistent acceptor state
pub acceptor_state: AcceptorState,
/// information about server
pub server: ServerInfoV3,
/// Unique id of the last *elected* proposer we dealed with. Not needed
/// for correctness, exists for monitoring purposes.
#[serde(with = "hex")]
pub proposer_uuid: PgUuid,
/// part of WAL acknowledged by quorum and available locally
pub commit_lsn: Lsn,
/// minimal LSN which may be needed for recovery of some safekeeper (end_lsn
/// of last record streamed to everyone)
pub truncate_lsn: Lsn,
// Safekeeper starts receiving WAL from this LSN, zeros before it ought to
// be skipped during decoding.
pub wal_start_lsn: Lsn,
}

pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<SafeKeeperState> {
// migrate to storing full term history
if version == 1 {
@@ -83,12 +116,20 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<SafeKeeperState>
}]),
};
return Ok(SafeKeeperState {
tenant_id: oldstate.server.tenant_id,
timeline_id: oldstate.server.ztli,
acceptor_state: ac,
server: oldstate.server.clone(),
server: ServerInfo {
pg_version: oldstate.server.pg_version,
system_id: oldstate.server.system_id,
wal_seg_size: oldstate.server.wal_seg_size,
},
proposer_uuid: oldstate.proposer_uuid,
commit_lsn: oldstate.commit_lsn,
truncate_lsn: oldstate.truncate_lsn,
wal_start_lsn: oldstate.wal_start_lsn,
s3_wal_lsn: Lsn(0),
peer_horizon_lsn: oldstate.truncate_lsn,
remote_consistent_lsn: Lsn(0),
peers: Peers(vec![]),
});
// migrate to hexing some zids
} else if version == 2 {
@@ -97,17 +138,40 @@ pub fn upgrade_control_file(buf: &[u8], version: u32) -> Result<SafeKeeperState>
let server = ServerInfo {
pg_version: oldstate.server.pg_version,
system_id: oldstate.server.system_id,
wal_seg_size: oldstate.server.wal_seg_size,
};
return Ok(SafeKeeperState {
tenant_id: oldstate.server.tenant_id,
timeline_id: oldstate.server.ztli,
acceptor_state: oldstate.acceptor_state,
server,
proposer_uuid: oldstate.proposer_uuid,
commit_lsn: oldstate.commit_lsn,
s3_wal_lsn: Lsn(0),
peer_horizon_lsn: oldstate.truncate_lsn,
remote_consistent_lsn: Lsn(0),
peers: Peers(vec![]),
});
// migrate to moving ztenantid/ztli to the top and adding some lsns
} else if version == 3 {
info!("reading safekeeper control file version {}", version);
let oldstate = SafeKeeperStateV3::des(&buf[..buf.len()])?;
let server = ServerInfo {
pg_version: oldstate.server.pg_version,
system_id: oldstate.server.system_id,
wal_seg_size: oldstate.server.wal_seg_size,
};
return Ok(SafeKeeperState {
tenant_id: oldstate.server.tenant_id,
timeline_id: oldstate.server.timeline_id,
acceptor_state: oldstate.acceptor_state,
server,
proposer_uuid: oldstate.proposer_uuid,
commit_lsn: oldstate.commit_lsn,
truncate_lsn: oldstate.truncate_lsn,
wal_start_lsn: oldstate.wal_start_lsn,
s3_wal_lsn: Lsn(0),
peer_horizon_lsn: oldstate.truncate_lsn,
remote_consistent_lsn: Lsn(0),
peers: Peers(vec![]),
});
}
bail!("unsupported safekeeper control file version {}", version)