Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PVF: Add worker check during tests and benches #1771

Merged
merged 16 commits into from
Oct 24, 2023
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions .cargo/config.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,14 @@ rustflags = [
"-Aclippy::stable_sort_primitive", # prefer stable sort
"-Aclippy::extra-unused-type-parameters", # stylistic
]

[env]
# The version of the node.
#
# This is the version that is used for versioning this node binary.
# By default the `minor` version is bumped in every release. `Major` or `patch` releases are only
# expected in very rare cases.
#
# The worker binaries associated with the node binary should ensure that they are using the same
# version as the main node that started them.
POLKADOT_NODE_VERSION = "1.1.0"
mrcnski marked this conversation as resolved.
Show resolved Hide resolved
4 changes: 3 additions & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

10 changes: 0 additions & 10 deletions polkadot/cli/src/cli.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,6 @@
use clap::Parser;
use std::path::PathBuf;

/// The version of the node.
///
/// This is the version that is used for versioning this node binary.
/// By default the `minor` version is bumped in every release. `Major` or `patch` releases are only
/// expected in very rare cases.
///
/// The worker binaries associated to the node binary should ensure that they are using the same
/// version as the main node that started them.
pub const NODE_VERSION: &'static str = "1.1.0";

#[allow(missing_docs)]
#[derive(Debug, Parser)]
pub enum Subcommand {
Expand Down
11 changes: 7 additions & 4 deletions polkadot/cli/src/command.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
// You should have received a copy of the GNU General Public License
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.

use crate::cli::{Cli, Subcommand, NODE_VERSION};
use crate::cli::{Cli, Subcommand};
use frame_benchmarking_cli::{BenchmarkCmd, ExtrinsicFactory, SUBSTRATE_REFERENCE_HARDWARE};
use futures::future::TryFutureExt;
use log::{info, warn};
Expand Down Expand Up @@ -50,7 +50,7 @@ impl SubstrateCli for Cli {

fn impl_version() -> String {
let commit_hash = env!("SUBSTRATE_CLI_COMMIT_HASH");
format!("{NODE_VERSION}-{commit_hash}")
format!("{}-{commit_hash}", env!("POLKADOT_NODE_VERSION"))
}

fn description() -> String {
Expand Down Expand Up @@ -242,8 +242,11 @@ where
None
};

let node_version =
if cli.run.disable_worker_version_check { None } else { Some(NODE_VERSION.to_string()) };
let node_version = if cli.run.disable_worker_version_check {
None
} else {
Some(env!("POLKADOT_NODE_VERSION").to_string())
};

runner.run_node_until_exit(move |config| async move {
let hwbench = (!cli.run.no_hardware_benchmarks)
Expand Down
10 changes: 7 additions & 3 deletions polkadot/node/core/pvf/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ cfg-if = "1.0"
futures = "0.3.21"
futures-timer = "3.0.2"
gum = { package = "tracing-gum", path = "../../gum" }
is_executable = "1.0.1"
libc = "0.2.139"
pin-project = "1.0.9"
rand = "0.8.5"
Expand All @@ -34,19 +35,22 @@ sp-maybe-compressed-blob = { path = "../../../../substrate/primitives/maybe-comp
polkadot-node-core-pvf-prepare-worker = { path = "prepare-worker", optional = true }
polkadot-node-core-pvf-execute-worker = { path = "execute-worker", optional = true }

[build-dependencies]
substrate-build-script-utils = { path = "../../../../substrate/utils/build-script-utils" }

[dev-dependencies]
assert_matches = "1.4.0"
criterion = { version = "0.4.0", default-features = false, features = ["cargo_bench_support", "async_tokio"] }
hex-literal = "0.4.1"
polkadot-node-core-pvf-common = { path = "common", features = ["test-utils"] }
# For the puppet worker, depend on ourselves with the test-utils feature.
polkadot-node-core-pvf = { path = ".", features = ["test-utils"] }
rococo-runtime = { path = "../../../runtime/rococo" }

adder = { package = "test-parachain-adder", path = "../../../parachain/test-parachains/adder" }
halt = { package = "test-parachain-halt", path = "../../../parachain/test-parachains/halt" }

[[bench]]
name = "host_prepare_rococo_runtime"
harness = false

[features]
ci-only-tests = []
jemalloc-allocator = [ "polkadot-node-core-pvf-common/jemalloc-allocator" ]
Expand Down
134 changes: 134 additions & 0 deletions polkadot/node/core/pvf/benches/host_prepare_rococo_runtime.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,134 @@
// Copyright (C) Parity Technologies (UK) Ltd.
// This file is part of Polkadot.

// Polkadot is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, either version 3 of the License, or
// (at your option) any later version.

// Polkadot is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.

// You should have received a copy of the GNU General Public License
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.

#[cfg(feature = "ci-only-tests")]
use assert_matches::assert_matches;
mrcnski marked this conversation as resolved.
Show resolved Hide resolved
use criterion::{criterion_group, criterion_main, Criterion, SamplingMode};
use polkadot_node_core_pvf::{
start, testing, Config, Metrics, PrepareError, PrepareJobKind, PrepareStats, PvfPrepData,
ValidationHost,
};
use polkadot_primitives::ExecutorParams;
use std::time::Duration;
use tokio::{runtime::Handle, sync::Mutex};

/// Preparation timeout attached to every pre-check request built by `TestHost::precheck_pvf`.
const TEST_PREPARATION_TIMEOUT: Duration = Duration::from_secs(30);

struct TestHost {
#[allow(unused)]
cache_dir: tempfile::TempDir,
mrcnski marked this conversation as resolved.
Show resolved Hide resolved
host: Mutex<ValidationHost>,
}

impl TestHost {
    /// Starts a validation host whose cache lives in a fresh temporary directory.
    ///
    /// `f` is given the freshly built [`Config`] and may tweak it before the host is started.
    /// The host's background task is spawned onto `handle`.
    ///
    /// # Panics
    ///
    /// Panics if the temporary directory cannot be created.
    fn new_with_config<F>(handle: &Handle, f: F) -> Self
    where
        F: FnOnce(&mut Config),
    {
        let (prepare_worker_path, execute_worker_path) = testing::get_and_check_worker_paths();

        let cache_dir = tempfile::tempdir().unwrap();
        let cache_path = cache_dir.path().to_owned();

        let mut config = Config::new(cache_path, None, prepare_worker_path, execute_worker_path);
        f(&mut config);

        let (host, task) = start(config, Metrics::default());
        // The join handle is intentionally discarded; the task is never joined here.
        let _ = handle.spawn(task);

        Self { cache_dir, host: Mutex::new(host) }
    }

    /// Submits `code` to the host as a pre-checking job and waits for its result.
    ///
    /// `code` is first passed through `sp_maybe_compressed_blob::decompress` with a 16 MiB
    /// limit. The job uses [`TEST_PREPARATION_TIMEOUT`] and `PrepareJobKind::Prechecking`.
    ///
    /// # Panics
    ///
    /// Panics if decompression fails, if the host does not accept the request, or if the
    /// result channel is closed before a result is delivered.
    async fn precheck_pvf(
        &self,
        code: &[u8],
        executor_params: ExecutorParams,
    ) -> Result<PrepareStats, PrepareError> {
        let (result_tx, result_rx) = futures::channel::oneshot::channel();

        let decompressed = sp_maybe_compressed_blob::decompress(code, 16 * 1024 * 1024)
            .expect("Compression works");

        // Scope the guard so the lock is released before we wait for the result.
        {
            let mut host = self.host.lock().await;
            let pvf = PvfPrepData::from_code(
                decompressed.into(),
                executor_params,
                TEST_PREPARATION_TIMEOUT,
                PrepareJobKind::Prechecking,
            );
            host.precheck_pvf(pvf, result_tx).await.unwrap();
        }

        result_rx.await.unwrap()
    }

    /// Asks the host to prune all artifacts and waits for the outcome reported by the host.
    ///
    /// # Panics
    ///
    /// Panics if the host does not accept the request or if the result channel is closed
    /// before a result is delivered.
    async fn prune_all_artifacts(&self) -> Result<usize, std::io::Error> {
        let (result_tx, result_rx) = futures::channel::oneshot::channel();

        // Scope the guard so the lock is released before we wait for the result.
        {
            let mut host = self.host.lock().await;
            host.prune_all_artifacts(result_tx).await.unwrap();
        }

        result_rx.await.unwrap()
    }
}

/// Criterion benchmark measuring how long the PVF host takes to prepare the Rococo runtime.
///
/// Every iteration submits the runtime code to a host limited to a single prepare worker and
/// then prunes the produced artifact, so that the next iteration has to prepare it again.
///
/// # Panics
///
/// Panics if the tokio runtime cannot be created, if the Rococo WASM binary is unavailable or
/// cannot be decompressed, or if an iteration fails to prepare or prune exactly one artifact.
fn host_prepare_rococo_runtime(c: &mut Criterion) {
    polkadot_node_core_pvf_common::sp_tracing::try_init_simple();

    let rt = tokio::runtime::Runtime::new().unwrap();

    let host = TestHost::new_with_config(rt.handle(), |cfg| {
        cfg.prepare_workers_hard_max_num = 1;
    });

    let blob = rococo_runtime::WASM_BINARY.unwrap();
    let code = sp_maybe_compressed_blob::decompress(blob, 64 * 1024 * 1024)
        .unwrap_or_else(|e| panic!("Cannot decompress blob: {:?}", e));
    // NOTE(review): only the code bytes of `pvf` reach the host below. `TestHost::precheck_pvf`
    // builds its own `PvfPrepData`, so the timeout and job kind chosen here are not what the
    // benchmarked request uses.
    let pvf = PvfPrepData::from_code(
        code.into_owned(),
        ExecutorParams::default(),
        Duration::from_secs(360),
        PrepareJobKind::Compilation,
    );

    let mut group = c.benchmark_group("rococo");
    group.sampling_mode(SamplingMode::Flat);
    group.sample_size(20);
    group.measurement_time(Duration::from_secs(240));
    // The host spinning up a worker will be done in criterion's "warmup" stage, and not counted
    // in the results.
    group.bench_function("host: prepare Rococo runtime", |b| {
        b.to_async(&rt).iter(|| async {
            // `PvfPrepData` is designed to be cheap to clone, so cloning shouldn't affect the
            // benchmark accuracy.
            let _stats = host.precheck_pvf(&pvf.clone().code(), Default::default()).await.unwrap();

            // Delete the prepared artifact. Otherwise the next iterations will immediately
            // finish.
            let num_deleted = host.prune_all_artifacts().await.unwrap();
            assert_eq!(num_deleted, 1);
        })
    });
    group.finish();
}

// Register `host_prepare_rococo_runtime` in the `preparation` benchmark group and let criterion
// provide the entry point (the bench target is declared with `harness = false`).
criterion_group!(preparation, host_prepare_rococo_runtime);
criterion_main!(preparation);
12 changes: 11 additions & 1 deletion polkadot/node/core/pvf/common/src/worker/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,14 @@ use tokio::{io, runtime::Runtime};
/// spawning the desired worker.
#[macro_export]
macro_rules! decl_worker_main {
($expected_command:expr, $entrypoint:expr, $worker_version:expr $(,)*) => {
($expected_command:expr, $entrypoint:expr, $worker_version:expr, $worker_version_hash:expr $(,)*) => {
fn get_full_version() -> String {
format!("{}-{}", $worker_version, $worker_version_hash)
}

fn print_help(expected_command: &str) {
println!("{} {}", expected_command, $worker_version);
println!("commit: {}", $worker_version_hash);
println!();
println!("PVF worker that is called by polkadot.");
}
Expand Down Expand Up @@ -67,6 +72,11 @@ macro_rules! decl_worker_main {
println!("{}", $worker_version);
return
},
// Useful for debugging. --version is used for version checks.
"--full-version" => {
println!("{}", get_full_version());
return
},

"--check-can-enable-landlock" => {
#[cfg(target_os = "linux")]
Expand Down
1 change: 1 addition & 0 deletions polkadot/node/core/pvf/src/artifacts.rs
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ impl ArtifactPathId {
}
}

#[derive(Debug)]
pub enum ArtifactState {
/// The artifact is ready to be used by the executor.
///
Expand Down
54 changes: 47 additions & 7 deletions polkadot/node/core/pvf/src/host.rs
Original file line number Diff line number Diff line change
Expand Up @@ -130,12 +130,33 @@ impl ValidationHost {
.await
.map_err(|_| "the inner loop hung up".to_string())
}

/// TEST-ONLY: asks the host to prune every artifact in its artifacts table.
///
/// The request is forwarded to the host loop as `ToHost::PruneAllArtifacts`, and the outcome
/// of the pruning is reported through `result_tx`.
///
/// Returns `Err` if the request could not be delivered to the host loop.
#[cfg(feature = "test-utils")]
pub async fn prune_all_artifacts(
    &mut self,
    result_tx: oneshot::Sender<Result<usize, std::io::Error>>,
) -> Result<(), String> {
    let request = ToHost::PruneAllArtifacts { result_tx };
    match self.to_host_tx.send(request).await {
        Ok(()) => Ok(()),
        Err(_) => Err("the inner loop hung up".to_string()),
    }
}
mrcnski marked this conversation as resolved.
Show resolved Hide resolved
}

enum ToHost {
PrecheckPvf { pvf: PvfPrepData, result_tx: PrepareResultSender },
PrecheckPvf {
pvf: PvfPrepData,
result_tx: PrepareResultSender,
},
ExecutePvf(ExecutePvfInputs),
HeadsUp { active_pvfs: Vec<PvfPrepData> },
HeadsUp {
active_pvfs: Vec<PvfPrepData>,
},
#[cfg(feature = "test-utils")]
PruneAllArtifacts {
mrcnski marked this conversation as resolved.
Show resolved Hide resolved
result_tx: oneshot::Sender<Result<usize, std::io::Error>>,
},
}

struct ExecutePvfInputs {
Expand Down Expand Up @@ -436,6 +457,25 @@ async fn handle_to_host(
},
ToHost::HeadsUp { active_pvfs } =>
handle_heads_up(artifacts, prepare_queue, active_pvfs).await?,
#[cfg(feature = "test-utils")]
ToHost::PruneAllArtifacts { result_tx } => {
let to_remove = artifacts.prune(Duration::ZERO);
gum::debug!(
target: LOG_TARGET,
"pruning all artifacts: {:?}",
to_remove
);
let mut result_to_send = Ok(to_remove.len());
for artifact_id in to_remove {
let artifact_path = artifact_id.path(cache_path);
let result = tokio::fs::remove_file(&artifact_path).await;
if let Err(err) = result {
result_to_send = Err(err);
break
}
}
let _ = result_tx.send(result_to_send);
},
}

Ok(())
Expand All @@ -446,7 +486,8 @@ async fn handle_to_host(
/// This tries to prepare the PVF by compiling the WASM blob within a timeout set in
/// `PvfPrepData`.
///
/// If the prepare job failed previously, we may retry it under certain conditions.
/// We don't retry artifacts that previously failed preparation. We don't expect multiple
/// pre-checking requests.
async fn handle_precheck_pvf(
artifacts: &mut Artifacts,
prepare_queue: &mut mpsc::Sender<prepare::ToQueue>,
Expand All @@ -464,8 +505,7 @@ async fn handle_precheck_pvf(
ArtifactState::Preparing { waiting_for_response, num_failures: _ } =>
waiting_for_response.push(result_sender),
ArtifactState::FailedToProcess { error, .. } => {
// Do not retry failed preparation if another pre-check request comes in. We do not
// retry pre-checking, anyway.
// Do not retry an artifact that previously failed preparation.
let _ = result_sender.send(PrepareResult::Err(error.clone()));
},
}
Expand Down Expand Up @@ -764,7 +804,7 @@ async fn handle_prepare_done(
let last_time_failed = SystemTime::now();
let num_failures = *num_failures + 1;

gum::warn!(
gum::error!(
target: LOG_TARGET,
?artifact_id,
time_failed = ?last_time_failed,
Expand Down Expand Up @@ -846,7 +886,7 @@ async fn sweeper_task(mut sweeper_rx: mpsc::Receiver<PathBuf>) {
gum::trace!(
target: LOG_TARGET,
?result,
"Sweeping the artifact file {}",
"Sweeped the artifact file {}",
condemned.display(),
);
},
Expand Down
Loading