diff --git a/Cargo.lock b/Cargo.lock index a19b80b2f4d7..110f51d75e80 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -120,9 +120,9 @@ dependencies = [ [[package]] name = "anyhow" -version = "1.0.34" +version = "1.0.39" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bf8dcb5b4bbaa28653b647d8c77bd4ed40183b48882e130c1f1ffb73de069fd7" +checksum = "81cddc5f91628367664cc7c69714ff08deee8a3efc54623011c772544d7b2767" [[package]] name = "approx" @@ -204,6 +204,16 @@ version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9b34d609dfbaf33d6889b2b7106d3ca345eacad44200913df5ba02bfd31d2ba9" +[[package]] +name = "async-attributes" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3203e79f4dd9bdda415ed03cf14dae5a2bf775c683a00f94e9cd1faf0f596e5" +dependencies = [ + "quote", + "syn", +] + [[package]] name = "async-channel" version = "1.5.1" @@ -293,6 +303,7 @@ version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8f9f84f1280a2b436a2c77c2582602732b6c2f4321d5494d6e799e6c367859a8" dependencies = [ + "async-attributes", "async-channel", "async-global-executor", "async-io", @@ -5579,7 +5590,7 @@ dependencies = [ "frame-benchmarking-cli", "futures 0.3.13", "log", - "polkadot-parachain", + "polkadot-node-core-pvf", "polkadot-service", "sc-cli", "sc-service", @@ -5823,8 +5834,10 @@ name = "polkadot-node-core-candidate-validation" version = "0.1.0" dependencies = [ "assert_matches", + "async-trait", "futures 0.3.13", "parity-scale-codec", + "polkadot-node-core-pvf", "polkadot-node-primitives", "polkadot-node-subsystem", "polkadot-node-subsystem-test-helpers", @@ -5893,6 +5906,38 @@ dependencies = [ "tracing", ] +[[package]] +name = "polkadot-node-core-pvf" +version = "0.1.0" +dependencies = [ + "always-assert", + "assert_matches", + "async-process", + "async-std", + "futures 0.3.13", + "futures-timer 3.0.2", + "hex-literal", + "libc", + "parity-scale-codec", + "pin-project 1.0.4", + "polkadot-core-primitives", + "polkadot-parachain", + "rand 0.8.3", + "sc-executor", + "sc-executor-common", + "sc-executor-wasmtime", + "slotmap", + "sp-core", + "sp-externalities", + "sp-io", + "sp-wasm-interface", + "tempfile", + "test-parachain-adder", + "test-parachain-halt", + "tracing", + "wasmtime-jit", +] + [[package]] name = "polkadot-node-core-runtime-api" version = "0.1.0" @@ -6074,25 +6119,13 @@ name = "polkadot-parachain" version = "0.8.30" dependencies = [ "derive_more", - "futures 0.3.13", - "libc", - "log", "parity-scale-codec", "parity-util-mem", - "parking_lot 0.11.1", "polkadot-core-primitives", - "raw_sync", - "sc-executor", "serde", - "shared_memory", "sp-core", - "sp-externalities", - "sp-io", "sp-runtime", "sp-std", - "sp-wasm-interface", - "static_assertions", - "thiserror", ] [[package]] @@ -7031,19 +7064,6 @@ dependencies = [ "rand_core 0.5.1", ] -[[package]] -name = "raw_sync" -version = "0.1.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a34bde3561f980a51c70495164200569a11662644fe5af017f0b5d7015688cc" -dependencies = [ - "cfg-if 0.1.10", - "libc", - "nix", - "rand 0.8.3", - "winapi 0.3.9", -] - [[package]] name = "rawpointer" version = "0.2.1" @@ -8704,20 +8724,6 @@ dependencies = [ "loom", ] -[[package]] -name = "shared_memory" -version = "0.11.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b854a362375dfe8ab12ea8a98228040d37293c988f85fbac9fa0f83336387966" -dependencies = [ - "cfg-if 0.1.10", - "libc", - "nix", - "quick-error 2.0.0", - "rand 0.8.3", - "winapi 0.3.9", -] - [[package]] name = "shlex" version = "0.1.1" @@ -8778,6 +8784,15 @@ dependencies = [ "sp-std", ] +[[package]] +name = "slotmap" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab3003725ae562cf995f3dc82bb99e70926e09000396816765bb6d7adbe740b1" +dependencies = [ + "version_check", +] + [[package]] name = "smallvec" version = "0.6.13" @@ -9868,6 +9883,7 @@ dependencies = [ "log", "parity-scale-codec", "polkadot-cli", + "polkadot-node-core-pvf", "polkadot-node-primitives", "polkadot-node-subsystem", "polkadot-parachain", diff --git a/Cargo.toml b/Cargo.toml index 17ba7ebea45e..65627b1fafce 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -55,6 +55,7 @@ members = [ "node/core/chain-api", "node/core/proposer", "node/core/provisioner", + "node/core/pvf", "node/core/runtime-api", "node/network/approval-distribution", "node/network/bridge", diff --git a/cli/Cargo.toml b/cli/Cargo.toml index 544e71d591ad..48717cdb39db 100644 --- a/cli/Cargo.toml +++ b/cli/Cargo.toml @@ -22,7 +22,7 @@ wasm-bindgen-futures = { version = "0.4.19", optional = true } futures = "0.3.12" service = { package = "polkadot-service", path = "../node/service", default-features = false, optional = true } -polkadot-parachain = { path = "../parachain", optional = true } +polkadot-node-core-pvf = { path = "../node/core/pvf", optional = true } sp-core = { git = "https://github.com/paritytech/substrate", branch = "master" } frame-benchmarking-cli = { git = "https://github.com/paritytech/substrate", branch = "master", optional = true } @@ -39,8 +39,8 @@ sp-trie = { git = "https://github.com/paritytech/substrate", branch = "master", substrate-build-script-utils = { git = "https://github.com/paritytech/substrate", branch = "master" } [features] -default = [ "wasmtime", "db", "cli", "full-node", "trie-memory-tracker", "polkadot-parachain" ] -wasmtime = [ "sc-cli/wasmtime", "polkadot-parachain/wasmtime" ] +default = [ "wasmtime", "db", "cli", "full-node", "trie-memory-tracker" ] +wasmtime = [ "sc-cli/wasmtime" ] db = [ "service/db" ] cli = [ "structopt", @@ -48,6 +48,7 @@ cli = [ "sc-service", "frame-benchmarking-cli", "try-runtime-cli", + "polkadot-node-core-pvf", ] browser = [ "wasm-bindgen", diff --git a/cli/src/cli.rs b/cli/src/cli.rs index 2dd56bc1ad42..8ebf5b23235c 100644 --- a/cli/src/cli.rs +++ b/cli/src/cli.rs @@ -43,8 +43,12 @@ pub enum Subcommand { Revert(sc_cli::RevertCmd), #[allow(missing_docs)] - #[structopt(name = "validation-worker", setting = structopt::clap::AppSettings::Hidden)] - ValidationWorker(ValidationWorkerCommand), + #[structopt(name = "prepare-worker", setting = structopt::clap::AppSettings::Hidden)] + PvfPrepareWorker(ValidationWorkerCommand), + + #[allow(missing_docs)] + #[structopt(name = "execute-worker", setting = structopt::clap::AppSettings::Hidden)] + PvfExecuteWorker(ValidationWorkerCommand), /// The custom benchmark subcommand benchmarking runtime pallets. #[structopt( @@ -64,11 +68,8 @@ pub enum Subcommand { #[allow(missing_docs)] #[derive(Debug, StructOpt)] pub struct ValidationWorkerCommand { - /// The path that the executor can use for its caching purposes. - pub cache_base_path: std::path::PathBuf, - - #[allow(missing_docs)] - pub mem_id: String, + /// The path to the validation host's socket. + pub socket_path: String, } #[allow(missing_docs)] diff --git a/cli/src/command.rs b/cli/src/command.rs index 99066756cb93..d0a5bc110cd9 100644 --- a/cli/src/command.rs +++ b/cli/src/command.rs @@ -256,19 +256,39 @@ pub fn run() -> Result<()> { Ok((cmd.run(client, backend).map_err(Error::SubstrateCli), task_manager)) })?) }, - Some(Subcommand::ValidationWorker(cmd)) => { + Some(Subcommand::PvfPrepareWorker(cmd)) => { let mut builder = sc_cli::LoggerBuilder::new(""); builder.with_colors(false); let _ = builder.init(); - if cfg!(feature = "browser") || cfg!(target_os = "android") { - Err(sc_cli::Error::Input("Cannot run validation worker in browser".into()).into()) - } else { - #[cfg(not(any(target_os = "android", feature = "browser")))] - polkadot_parachain::wasm_executor::run_worker( - &cmd.mem_id, - Some(cmd.cache_base_path.clone()), - )?; + #[cfg(any(target_os = "android", feature = "browser"))] + { + return Err( + sc_cli::Error::Input("PVF preparation workers are not supported under this platform".into()).into() + ); + } + + #[cfg(not(any(target_os = "android", feature = "browser")))] + { + polkadot_node_core_pvf::prepare_worker_entrypoint(&cmd.socket_path); + Ok(()) + } + }, + Some(Subcommand::PvfExecuteWorker(cmd)) => { + let mut builder = sc_cli::LoggerBuilder::new(""); + builder.with_colors(false); + let _ = builder.init(); + + #[cfg(any(target_os = "android", feature = "browser"))] + { + return Err( + sc_cli::Error::Input("PVF execution workers are not supported under this platform".into()).into() + ); + } + + #[cfg(not(any(target_os = "android", feature = "browser")))] + { + polkadot_node_core_pvf::execute_worker_entrypoint(&cmd.socket_path); Ok(()) } }, diff --git a/node/core/candidate-validation/Cargo.toml b/node/core/candidate-validation/Cargo.toml index e4d2c5ba80ba..f48723f1def8 100644 --- a/node/core/candidate-validation/Cargo.toml +++ b/node/core/candidate-validation/Cargo.toml @@ -5,10 +5,10 @@ authors = ["Parity Technologies "] edition = "2018" [dependencies] +async-trait = "0.1.42" futures = "0.3.12" tracing = "0.1.25" -sp-core = { package = "sp-core", git = "https://github.com/paritytech/substrate", branch = "master" } sp-maybe-compressed-blob = { package = "sp-maybe-compressed-blob", git = "https://github.com/paritytech/substrate", branch = "master" } parity-scale-codec = { version = "2.0.0", default-features = false, features = ["bit-vec", "derive"] } @@ -18,8 +18,12 @@ polkadot-node-primitives = { path = "../../primitives" } polkadot-subsystem = { package = "polkadot-node-subsystem", path = "../../subsystem" } polkadot-node-subsystem-util = { path = "../../subsystem-util" } +[target.'cfg(not(any(target_os = "android", target_os = "unknown")))'.dependencies] +polkadot-node-core-pvf = { path = "../pvf" } + [dev-dependencies] sp-keyring = { git = "https://github.com/paritytech/substrate", branch = "master" } futures = { version = "0.3.12", features = ["thread-pool"] } assert_matches = "1.4.0" polkadot-node-subsystem-test-helpers = { path = "../../subsystem-test-helpers" } +sp-core = { git = "https://github.com/paritytech/substrate", branch = "master" } diff --git a/node/core/candidate-validation/src/lib.rs b/node/core/candidate-validation/src/lib.rs index 573b2f1c7110..83ea629f4974 100644 --- a/node/core/candidate-validation/src/lib.rs +++ b/node/core/candidate-validation/src/lib.rs @@ -40,44 +40,51 @@ use polkadot_primitives::v1::{ ValidationCode, CandidateDescriptor, PersistedValidationData, OccupiedCoreAssumption, Hash, CandidateCommitments, }; -use polkadot_parachain::wasm_executor::{ - self, IsolationStrategy, ValidationError, InvalidCandidate as WasmInvalidCandidate -}; -use polkadot_parachain::primitives::{ValidationResult as WasmValidationResult, ValidationParams}; +use polkadot_parachain::primitives::{ValidationParams, ValidationResult as WasmValidationResult}; +use polkadot_node_core_pvf::{Pvf, ValidationHost, ValidationError, InvalidCandidate as WasmInvalidCandidate}; use parity_scale_codec::Encode; -use sp_core::traits::SpawnNamed; use futures::channel::oneshot; use futures::prelude::*; use std::sync::Arc; +use std::path::PathBuf; + +use async_trait::async_trait; const LOG_TARGET: &'static str = "parachain::candidate-validation"; +/// Configuration for the candidate validation subsystem +pub struct Config { + /// The path where candidate validation can store compiled artifacts for PVFs. + pub artifacts_cache_path: PathBuf, + /// The path to the executable which can be used for spawning PVF compilation & validation + /// workers. + pub program_path: PathBuf, +} + /// The candidate validation subsystem. -pub struct CandidateValidationSubsystem { - spawn: S, +pub struct CandidateValidationSubsystem { metrics: Metrics, - isolation_strategy: IsolationStrategy, + config: Config, } -impl CandidateValidationSubsystem { +impl CandidateValidationSubsystem { /// Create a new `CandidateValidationSubsystem` with the given task spawner and isolation /// strategy. /// /// Check out [`IsolationStrategy`] to get more details. - pub fn new(spawn: S, metrics: Metrics, isolation_strategy: IsolationStrategy) -> Self { - CandidateValidationSubsystem { spawn, metrics, isolation_strategy } + pub fn with_config(config: Config, metrics: Metrics) -> Self { + CandidateValidationSubsystem { config, metrics, } } } -impl Subsystem for CandidateValidationSubsystem where +impl Subsystem for CandidateValidationSubsystem where C: SubsystemContext, - S: SpawnNamed + Clone + 'static, { fn start(self, ctx: C) -> SpawnedSubsystem { - let future = run(ctx, self.spawn, self.metrics, self.isolation_strategy) + let future = run(ctx, self.metrics, self.config.artifacts_cache_path, self.config.program_path) .map_err(|e| SubsystemError::with_origin("candidate-validation", e)) .boxed(); SpawnedSubsystem { @@ -87,13 +94,18 @@ impl Subsystem for CandidateValidationSubsystem where } } -#[tracing::instrument(skip(ctx, spawn, metrics), fields(subsystem = LOG_TARGET))] +#[tracing::instrument(skip(ctx, metrics), fields(subsystem = LOG_TARGET))] async fn run( mut ctx: impl SubsystemContext, - spawn: impl SpawnNamed + Clone + 'static, metrics: Metrics, - isolation_strategy: IsolationStrategy, + cache_path: PathBuf, + program_path: PathBuf, ) -> SubsystemResult<()> { + let (mut validation_host, task) = polkadot_node_core_pvf::start( + polkadot_node_core_pvf::Config::new(cache_path, program_path), + ); + ctx.spawn_blocking("pvf-validation-host", task.boxed()).await?; + loop { match ctx.recv().await? { FromOverseer::Signal(OverseerSignal::ActiveLeaves(_)) => {} @@ -109,10 +121,9 @@ async fn run( let res = spawn_validate_from_chain_state( &mut ctx, - isolation_strategy.clone(), + &mut validation_host, descriptor, pov, - spawn.clone(), &metrics, ).await; @@ -133,14 +144,12 @@ async fn run( ) => { let _timer = metrics.time_validate_from_exhaustive(); - let res = spawn_validate_exhaustive( - &mut ctx, - isolation_strategy.clone(), + let res = validate_candidate_exhaustive( + &mut validation_host, persisted_validation_data, validation_code, descriptor, pov, - spawn.clone(), &metrics, ).await; @@ -268,13 +277,16 @@ async fn find_assumed_validation_data( Ok(AssumptionCheckOutcome::DoesNotMatch) } -#[tracing::instrument(level = "trace", skip(ctx, pov, spawn, metrics), fields(subsystem = LOG_TARGET))] +#[tracing::instrument( + level = "trace", + skip(ctx, validation_host, pov, metrics), + fields(subsystem = LOG_TARGET), +)] async fn spawn_validate_from_chain_state( ctx: &mut impl SubsystemContext, - isolation_strategy: IsolationStrategy, + validation_host: &mut ValidationHost, descriptor: CandidateDescriptor, pov: Arc, - spawn: impl SpawnNamed + 'static, metrics: &Metrics, ) -> SubsystemResult> { let (validation_data, validation_code) = @@ -293,14 +305,12 @@ async fn spawn_validate_from_chain_state( } }; - let validation_result = spawn_validate_exhaustive( - ctx, - isolation_strategy, + let validation_result = validate_candidate_exhaustive( + validation_host, validation_data, validation_code, descriptor.clone(), pov, - spawn, metrics, ) .await; @@ -330,113 +340,19 @@ async fn spawn_validate_from_chain_state( validation_result } -#[tracing::instrument(level = "trace", skip(ctx, validation_code, pov, spawn, metrics), fields(subsystem = LOG_TARGET))] -async fn spawn_validate_exhaustive( - ctx: &mut impl SubsystemContext, - isolation_strategy: IsolationStrategy, +#[tracing::instrument( + level = "trace", + skip(validation_backend, validation_code, pov, metrics), + fields(subsystem = LOG_TARGET), +)] +async fn validate_candidate_exhaustive( + mut validation_backend: impl ValidationBackend, persisted_validation_data: PersistedValidationData, validation_code: ValidationCode, descriptor: CandidateDescriptor, pov: Arc, - spawn: impl SpawnNamed + 'static, metrics: &Metrics, ) -> SubsystemResult> { - let (tx, rx) = oneshot::channel(); - let metrics = metrics.clone(); - let fut = async move { - let res = validate_candidate_exhaustive::( - isolation_strategy, - persisted_validation_data, - validation_code, - descriptor, - pov, - spawn, - &metrics, - ); - - let _ = tx.send(res); - }; - - ctx.spawn_blocking("blocking-candidate-validation-task", fut.boxed()).await?; - rx.await.map_err(Into::into) -} - -/// Does basic checks of a candidate. Provide the encoded PoV-block. Returns `Ok` if basic checks -/// are passed, `Err` otherwise. -#[tracing::instrument(level = "trace", skip(pov, validation_code), fields(subsystem = LOG_TARGET))] -fn perform_basic_checks( - candidate: &CandidateDescriptor, - max_pov_size: u32, - pov: &PoV, - validation_code: &ValidationCode, -) -> Result<(), InvalidCandidate> { - let pov_hash = pov.hash(); - let validation_code_hash = validation_code.hash(); - - let encoded_pov_size = pov.encoded_size(); - if encoded_pov_size > max_pov_size as usize { - return Err(InvalidCandidate::ParamsTooLarge(encoded_pov_size as u64)); - } - - if pov_hash != candidate.pov_hash { - return Err(InvalidCandidate::PoVHashMismatch); - } - - if validation_code_hash != candidate.validation_code_hash { - return Err(InvalidCandidate::CodeHashMismatch); - } - - if let Err(()) = candidate.check_collator_signature() { - return Err(InvalidCandidate::BadSignature); - } - - Ok(()) -} - -trait ValidationBackend { - type Arg; - - fn validate( - arg: Self::Arg, - raw_validation_code: &[u8], - params: ValidationParams, - spawn: S, - ) -> Result; -} - -struct RealValidationBackend; - -impl ValidationBackend for RealValidationBackend { - type Arg = IsolationStrategy; - - fn validate( - isolation_strategy: IsolationStrategy, - raw_validation_code: &[u8], - params: ValidationParams, - spawn: S, - ) -> Result { - wasm_executor::validate_candidate( - &raw_validation_code, - params, - &isolation_strategy, - spawn, - ) - } -} - -/// Validates the candidate from exhaustive parameters. -/// -/// Sends the result of validation on the channel once complete. -#[tracing::instrument(level = "trace", skip(backend_arg, validation_code, pov, spawn, metrics), fields(subsystem = LOG_TARGET))] -fn validate_candidate_exhaustive( - backend_arg: B::Arg, - persisted_validation_data: PersistedValidationData, - validation_code: ValidationCode, - descriptor: CandidateDescriptor, - pov: Arc, - spawn: S, - metrics: &Metrics, -) -> Result { let _timer = metrics.time_validate_candidate_exhaustive(); if let Err(e) = perform_basic_checks( @@ -445,7 +361,7 @@ fn validate_candidate_exhaustive( &*pov, &validation_code, ) { - return Ok(ValidationResult::Invalid(e)) + return Ok(Ok(ValidationResult::Invalid(e))); } let raw_validation_code = match sp_maybe_compressed_blob::decompress( @@ -457,7 +373,7 @@ fn validate_candidate_exhaustive( tracing::debug!(target: LOG_TARGET, err=?e, "Invalid validation code"); // If the validation code is invalid, the candidate certainly is. - return Ok(ValidationResult::Invalid(InvalidCandidate::CodeDecompressionFailure)); + return Ok(Ok(ValidationResult::Invalid(InvalidCandidate::CodeDecompressionFailure))); } }; @@ -470,7 +386,7 @@ fn validate_candidate_exhaustive( tracing::debug!(target: LOG_TARGET, err=?e, "Invalid PoV code"); // If the PoV is invalid, the candidate certainly is. - return Ok(ValidationResult::Invalid(InvalidCandidate::PoVDecompressionFailure)); + return Ok(Ok(ValidationResult::Invalid(InvalidCandidate::PoVDecompressionFailure))); } }; @@ -481,38 +397,109 @@ fn validate_candidate_exhaustive( relay_parent_storage_root: persisted_validation_data.relay_parent_storage_root, }; - match B::validate(backend_arg, &raw_validation_code, params, spawn) { - Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::Timeout)) => + let result = + validation_backend.validate_candidate( + raw_validation_code.to_vec(), + params + ) + .await; + + let result = match result { + Err(ValidationError::InternalError(e)) => Err(ValidationFailed(e)), + + Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::HardTimeout)) => Ok(ValidationResult::Invalid(InvalidCandidate::Timeout)), - Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::ParamsTooLarge(l, _))) => - Ok(ValidationResult::Invalid(InvalidCandidate::ParamsTooLarge(l as u64))), - Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::CodeTooLarge(l, _))) => - Ok(ValidationResult::Invalid(InvalidCandidate::CodeTooLarge(l as u64))), - Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::BadReturn)) => - Ok(ValidationResult::Invalid(InvalidCandidate::BadReturn)), - Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::WasmExecutor(e))) => - Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(e.to_string()))), - Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::ExternalWasmExecutor(e))) => - Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(e.to_string()))), - Err(ValidationError::Internal(e)) => Err(ValidationFailed(e.to_string())), + Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::WorkerReportedError(e))) => + Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError(e))), + Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbigiousWorkerDeath)) => + Ok(ValidationResult::Invalid(InvalidCandidate::ExecutionError("ambigious worker death".to_string()))), + Ok(res) => { if res.head_data.hash() != descriptor.para_head { - return Ok(ValidationResult::Invalid(InvalidCandidate::ParaHeadHashMismatch)); + Ok(ValidationResult::Invalid(InvalidCandidate::ParaHeadHashMismatch)) + } else { + let outputs = CandidateCommitments { + head_data: res.head_data, + upward_messages: res.upward_messages, + horizontal_messages: res.horizontal_messages, + new_validation_code: res.new_validation_code, + processed_downward_messages: res.processed_downward_messages, + hrmp_watermark: res.hrmp_watermark, + }; + Ok(ValidationResult::Valid(outputs, persisted_validation_data)) } + } + }; + + Ok(result) +} + +#[async_trait] +trait ValidationBackend { + async fn validate_candidate( + &mut self, + raw_validation_code: Vec, + params: ValidationParams + ) -> Result; +} - let outputs = CandidateCommitments { - head_data: res.head_data, - upward_messages: res.upward_messages, - horizontal_messages: res.horizontal_messages, - new_validation_code: res.new_validation_code, - processed_downward_messages: res.processed_downward_messages, - hrmp_watermark: res.hrmp_watermark, - }; - Ok(ValidationResult::Valid(outputs, persisted_validation_data)) +#[async_trait] +impl ValidationBackend for &'_ mut ValidationHost { + async fn validate_candidate( + &mut self, + raw_validation_code: Vec, + params: ValidationParams + ) -> Result { + let (tx, rx) = oneshot::channel(); + if let Err(err) = self.execute_pvf( + Pvf::from_code(raw_validation_code), + params.encode(), + polkadot_node_core_pvf::Priority::Normal, + tx, + ).await { + return Err(ValidationError::InternalError(format!("cannot send pvf to the validation host: {:?}", err))); } + + let validation_result = rx + .await + .map_err(|_| ValidationError::InternalError("validation was cancelled".into()))?; + + validation_result } } +/// Does basic checks of a candidate. Provide the encoded PoV-block. Returns `Ok` if basic checks +/// are passed, `Err` otherwise. +#[tracing::instrument(level = "trace", skip(pov, validation_code), fields(subsystem = LOG_TARGET))] +fn perform_basic_checks( + candidate: &CandidateDescriptor, + max_pov_size: u32, + pov: &PoV, + validation_code: &ValidationCode, +) -> Result<(), InvalidCandidate> { + let pov_hash = pov.hash(); + let validation_code_hash = validation_code.hash(); + + let encoded_pov_size = pov.encoded_size(); + if encoded_pov_size > max_pov_size as usize { + return Err(InvalidCandidate::ParamsTooLarge(encoded_pov_size as u64)); + } + + if pov_hash != candidate.pov_hash { + return Err(InvalidCandidate::PoVHashMismatch); + } + + if validation_code_hash != candidate.validation_code_hash { + return Err(InvalidCandidate::CodeHashMismatch); + } + + if let Err(()) = candidate.check_collator_signature() { + return Err(InvalidCandidate::BadSignature); + } + + Ok(()) +} + #[derive(Clone)] struct MetricsInner { validation_requests: prometheus::CounterVec, @@ -605,7 +592,7 @@ impl metrics::Metrics for Metrics { #[cfg(test)] mod tests { - use super::*; + use super::*; use polkadot_node_subsystem_test_helpers as test_helpers; use polkadot_primitives::v1::{HeadData, UpwardMessage}; use sp_core::testing::TaskExecutor; @@ -613,25 +600,6 @@ mod tests { use assert_matches::assert_matches; use sp_keyring::Sr25519Keyring; - struct MockValidationBackend; - - struct MockValidationArg { - result: Result, - } - - impl ValidationBackend for MockValidationBackend { - type Arg = MockValidationArg; - - fn validate( - arg: Self::Arg, - _raw_validation_code: &[u8], - _params: ValidationParams, - _spawn: S, - ) -> Result { - arg.result - } - } - fn collator_sign(descriptor: &mut CandidateDescriptor, collator: Sr25519Keyring) { descriptor.collator = collator.public().into(); let payload = polkadot_primitives::v1::collator_signature_payload( @@ -924,6 +892,29 @@ mod tests { executor::block_on(test_fut); } + struct MockValidatorBackend { + result: Result, + } + + impl MockValidatorBackend { + fn with_hardcoded_result(result: Result) -> Self { + Self { + result, + } + } + } + + #[async_trait] + impl ValidationBackend for MockValidatorBackend { + async fn validate_candidate( + &mut self, + _raw_validation_code: Vec, + _params: ValidationParams + ) -> Result { + self.result.clone() + } + } + #[test] fn candidate_validation_ok_is_ok() { let validation_data = PersistedValidationData { max_pov_size: 1024, ..Default::default() }; @@ -955,15 +946,16 @@ mod tests { hrmp_watermark: 0, }; - let v = validate_candidate_exhaustive::( - MockValidationArg { result: Ok(validation_result) }, + let v = executor::block_on(validate_candidate_exhaustive( + MockValidatorBackend::with_hardcoded_result(Ok(validation_result)), validation_data.clone(), validation_code, descriptor, Arc::new(pov), - TaskExecutor::new(), &Default::default(), - ).unwrap(); + )) + .unwrap() + .unwrap(); assert_matches!(v, ValidationResult::Valid(outputs, used_validation_data) => { assert_eq!(outputs.head_data, HeadData(vec![1, 1, 1])); @@ -995,21 +987,20 @@ mod tests { ); assert!(check.is_ok()); - let v = validate_candidate_exhaustive::( - MockValidationArg { - result: Err(ValidationError::InvalidCandidate( - WasmInvalidCandidate::BadReturn - )) - }, + let v = executor::block_on(validate_candidate_exhaustive( + MockValidatorBackend::with_hardcoded_result( + Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::AmbigiousWorkerDeath)) + ), validation_data, validation_code, descriptor, Arc::new(pov), - TaskExecutor::new(), &Default::default(), - ).unwrap(); + )) + .unwrap() + .unwrap(); - assert_matches!(v, ValidationResult::Invalid(InvalidCandidate::BadReturn)); + assert_matches!(v, ValidationResult::Invalid(InvalidCandidate::ExecutionError(_))); } #[test] @@ -1032,19 +1023,17 @@ mod tests { ); assert!(check.is_ok()); - let v = validate_candidate_exhaustive::( - MockValidationArg { - result: Err(ValidationError::InvalidCandidate( - WasmInvalidCandidate::Timeout - )) - }, + let v = executor::block_on(validate_candidate_exhaustive( + MockValidatorBackend::with_hardcoded_result( + Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::HardTimeout)), + ), validation_data, validation_code, descriptor, Arc::new(pov), - TaskExecutor::new(), &Default::default(), - ); + )) + .unwrap(); assert_matches!(v, Ok(ValidationResult::Invalid(InvalidCandidate::Timeout))); } @@ -1069,19 +1058,18 @@ mod tests { ); assert_matches!(check, Err(InvalidCandidate::CodeHashMismatch)); - let v = validate_candidate_exhaustive::( - MockValidationArg { - result: Err(ValidationError::InvalidCandidate( - WasmInvalidCandidate::BadReturn - )) - }, + let v = executor::block_on(validate_candidate_exhaustive( + MockValidatorBackend::with_hardcoded_result( + Err(ValidationError::InvalidCandidate(WasmInvalidCandidate::HardTimeout)), + ), validation_data, validation_code, descriptor, Arc::new(pov), - TaskExecutor::new(), &Default::default(), - ).unwrap(); + )) + .unwrap() + .unwrap(); assert_matches!(v, ValidationResult::Invalid(InvalidCandidate::CodeHashMismatch)); } @@ -1115,15 +1103,15 @@ mod tests { hrmp_watermark: 0, }; - let v = validate_candidate_exhaustive::( - MockValidationArg { result: Ok(validation_result) }, + let v = executor::block_on(validate_candidate_exhaustive( + MockValidatorBackend::with_hardcoded_result(Ok(validation_result)), validation_data, validation_code, descriptor, Arc::new(pov), - TaskExecutor::new(), &Default::default(), - ); + )) + .unwrap(); assert_matches!(v, Ok(ValidationResult::Valid(_, _))); } @@ -1157,15 +1145,15 @@ mod tests { hrmp_watermark: 0, }; - let v = validate_candidate_exhaustive::( - MockValidationArg { result: Ok(validation_result) }, + let v = executor::block_on(validate_candidate_exhaustive( + MockValidatorBackend::with_hardcoded_result(Ok(validation_result)), validation_data, validation_code, descriptor, Arc::new(pov), - TaskExecutor::new(), &Default::default(), - ); + )) + .unwrap(); assert_matches!( v, @@ -1206,15 +1194,15 @@ mod tests { hrmp_watermark: 0, }; - let v = validate_candidate_exhaustive::( - MockValidationArg { result: Ok(validation_result) }, + let v = executor::block_on(validate_candidate_exhaustive( + MockValidatorBackend::with_hardcoded_result(Ok(validation_result)), validation_data, validation_code, descriptor, Arc::new(pov), - TaskExecutor::new(), &Default::default(), - ); + )) + .unwrap(); assert_matches!( v, diff --git a/node/core/pvf/Cargo.toml b/node/core/pvf/Cargo.toml new file mode 100644 index 000000000000..c0fdbba9c49e --- /dev/null +++ b/node/core/pvf/Cargo.toml @@ -0,0 +1,53 @@ +[package] +name = "polkadot-node-core-pvf" +version = "0.1.0" +authors = ["Parity Technologies "] +edition = "2018" + +[[bin]] +name = "puppet_worker" +path = "bin/puppet_worker.rs" + +[dependencies] +always-assert = "0.1" +async-std = { version = "1.8.0", features = ["attributes"] } +async-process = "1.0.1" +assert_matches = "1.4.0" +futures = "0.3.12" +futures-timer = "3.0.2" +libc = "0.2.81" +slotmap = "1.0" +tracing = "0.1.22" +pin-project = "1.0.4" +rand = "0.8.3" +parity-scale-codec = { version = "2.0.0", default-features = false, features = ["derive"] } +polkadot-parachain = { path = "../../../parachain" } +polkadot-core-primitives = { path = "../../../core-primitives" } +sc-executor = { git = "https://github.com/paritytech/substrate", branch = "master" } +sc-executor-wasmtime = { git = "https://github.com/paritytech/substrate", branch = "master" } +sc-executor-common = { git = "https://github.com/paritytech/substrate", branch = "master" } +sp-externalities = { git = "https://github.com/paritytech/substrate", branch = "master" } +sp-io = { git = "https://github.com/paritytech/substrate", branch = "master" } +sp-core = { git = "https://github.com/paritytech/substrate", branch = "master" } +sp-wasm-interface = { git = "https://github.com/paritytech/substrate", branch = "master" } + +[dev-dependencies] +adder = { package = "test-parachain-adder", path = "../../../parachain/test-parachains/adder" } +halt = { package = "test-parachain-halt", path = "../../../parachain/test-parachains/halt" } +hex-literal = "0.3.1" +tempfile = "3.2.0" + +# PVF execution leverages compiled artifacts provided by wasmtime. The contents of the artifacts +# depends on the version of wasmtime. In this crate we persist the artifacts on disk so we should +# be careful about the updates. In order to handle this, we depend on the wasmtime version here +# that we think is used by the sc-executor. If wasmtime is updated in Substrate and wasn't updated +# here then there will be linking errors like +# +# `multiple definitions of `set_vmctx_memory`. +# +# or similar, because wasmtime exports these symbols and does not support multiple versions compiled +# in at the same time. +# +# Another safeguard is a test `ensure_wasmtime_version` that will fail on each bump and prompt the +# developer to correspondingly act upon the change. +wasmtime-jit = "0.24" diff --git a/node/core/pvf/bin/puppet_worker.rs b/node/core/pvf/bin/puppet_worker.rs new file mode 100644 index 000000000000..4b026e96a809 --- /dev/null +++ b/node/core/pvf/bin/puppet_worker.rs @@ -0,0 +1,17 @@ +// Copyright 2021 Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +polkadot_node_core_pvf::decl_puppet_worker_main!(); diff --git a/node/core/pvf/src/artifacts.rs b/node/core/pvf/src/artifacts.rs new file mode 100644 index 000000000000..be7cad57fd16 --- /dev/null +++ b/node/core/pvf/src/artifacts.rs @@ -0,0 +1,311 @@ +// Copyright 2021 Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +use crate::LOG_TARGET; +use always_assert::always; +use async_std::{ + io, + path::{Path, PathBuf}, +}; +use polkadot_core_primitives::Hash; +use std::{ + collections::HashMap, + time::{Duration, SystemTime}, +}; +use parity_scale_codec::{Encode, Decode}; +use futures::StreamExt; + +/// A final product of preparation process. Contains either a ready to run compiled artifact or +/// a description what went wrong. +#[derive(Encode, Decode)] +pub enum Artifact { + /// During the prevalidation stage of preparation an issue was found with the PVF. + PrevalidationErr(String), + /// Compilation failed for the given PVF. + PreparationErr(String), + /// This state indicates that the process assigned to prepare the artifact wasn't responsible + /// or were killed. This state is reported by the validation host (not by the worker). + DidntMakeIt, + /// The PVF passed all the checks and is ready for execution. + Compiled { compiled_artifact: Vec }, +} + +impl Artifact { + /// Serializes this struct into a byte buffer. + pub fn serialize(&self) -> Vec { + self.encode() + } + + /// Deserialize the given byte buffer to an artifact. + pub fn deserialize(mut bytes: &[u8]) -> Result { + Artifact::decode(&mut bytes).map_err(|e| format!("{:?}", e)) + } +} + +/// Identifier of an artifact. Right now it only encodes a code hash of the PVF. But if we get to +/// multiple engine implementations the artifact ID should include the engine type as well. +#[derive(Clone, Debug, PartialEq, Eq, PartialOrd, Ord, Hash)] +pub struct ArtifactId { + code_hash: Hash, +} + +impl ArtifactId { + const PREFIX: &'static str = "wasmtime_1_"; + + /// Creates a new artifact ID with the given hash. + pub fn new(code_hash: Hash) -> Self { + Self { code_hash } + } + + /// Tries to recover the artifact id from the given file name. + pub fn from_file_name(file_name: &str) -> Option { + use std::str::FromStr as _; + + let file_name = file_name.strip_prefix(Self::PREFIX)?; + let code_hash = Hash::from_str(file_name).ok()?; + + Some(Self { code_hash }) + } + + /// Returns the expected path to this artifact given the root of the cache. + pub fn path(&self, cache_path: &Path) -> PathBuf { + let file_name = format!("{}{}", Self::PREFIX, self.code_hash.to_string()); + cache_path.join(file_name) + } +} + +pub enum ArtifactState { + /// The artifact is ready to be used by the executor. + /// + /// That means that the artifact should be accessible through the path obtained by the artifact + /// id (unless, it was removed externally). + Prepared { + /// The time when the artifact was the last time needed. + /// + /// This is updated when we get the heads up for this artifact or when we just discover + /// this file. + last_time_needed: SystemTime, + }, + /// A task to prepare this artifact is scheduled. + Preparing, +} + +/// A container of all known artifact ids and their states. +pub struct Artifacts { + artifacts: HashMap, +} + +impl Artifacts { + /// Scan the given cache root for the artifacts. + /// + /// The recognized artifacts will be filled in the table and unrecognized will be removed. + pub async fn new(cache_path: &Path) -> Self { + // Make sure that the cache path directory and all it's parents are created. + let _ = async_std::fs::create_dir_all(cache_path).await; + + let artifacts = match scan_for_known_artifacts(cache_path).await { + Ok(a) => a, + Err(err) => { + tracing::warn!( + target: LOG_TARGET, + "unable to seed the artifacts in memory cache: {:?}. Starting with a clean one", + err, + ); + HashMap::new() + } + }; + + Self { artifacts } + } + + #[cfg(test)] + pub(crate) fn empty() -> Self { + Self { + artifacts: HashMap::new(), + } + } + + /// Returns the state of the given artifact by its ID. + pub fn artifact_state_mut(&mut self, artifact_id: &ArtifactId) -> Option<&mut ArtifactState> { + self.artifacts.get_mut(artifact_id) + } + + /// Inform the table about the artifact with the given ID. The state will be set to "preparing". + /// + /// This function must be used only for brand new artifacts and should never be used for + /// replacing existing ones. + pub fn insert_preparing(&mut self, artifact_id: ArtifactId) { + // See the precondition. + always!(self + .artifacts + .insert(artifact_id, ArtifactState::Preparing) + .is_none()); + } + + /// Insert an artifact with the given ID as "prepared". + /// + /// This function must be used only for brand new artifacts and should never be used for + /// replacing existing ones. + #[cfg(test)] + pub fn insert_prepared(&mut self, artifact_id: ArtifactId, last_time_needed: SystemTime) { + // See the precondition. + always!(self + .artifacts + .insert(artifact_id, ArtifactState::Prepared { last_time_needed }) + .is_none()); + } + + /// Remove and retrive the artifacts from the table that are older than the supplied Time-To-Live. + pub fn prune(&mut self, artifact_ttl: Duration) -> Vec { + let now = SystemTime::now(); + + let mut to_remove = vec![]; + for (k, v) in self.artifacts.iter() { + if let ArtifactState::Prepared { + last_time_needed, .. + } = *v { + if now + .duration_since(last_time_needed) + .map(|age| age > artifact_ttl) + .unwrap_or(false) + { + to_remove.push(k.clone()); + } + } + } + + for artifact in &to_remove { + self.artifacts.remove(artifact); + } + + to_remove + } +} + +/// Goes over all files in the given directory, collecting all recognizable artifacts. All files +/// that do not look like artifacts are removed. +/// +/// All recognized artifacts will be created with the current datetime. +async fn scan_for_known_artifacts( + cache_path: &Path, +) -> io::Result> { + let mut result = HashMap::new(); + let now = SystemTime::now(); + + let mut dir = async_std::fs::read_dir(cache_path).await?; + while let Some(res) = dir.next().await { + let entry = res?; + + if entry.file_type().await?.is_dir() { + tracing::debug!( + target: LOG_TARGET, + "{} is a dir, and dirs do not belong to us. Removing", + entry.path().display(), + ); + let _ = async_std::fs::remove_dir_all(entry.path()).await; + } + + let path = entry.path(); + let file_name = match path.file_name() { + None => { + // A file without a file name? Weird, just skip it. + continue; + } + Some(file_name) => file_name, + }; + + let file_name = match file_name.to_str() { + None => { + tracing::debug!( + target: LOG_TARGET, + "{} is not utf-8. Removing", + path.display(), + ); + let _ = async_std::fs::remove_file(&path).await; + continue; + } + Some(file_name) => file_name, + }; + + let artifact_id = match ArtifactId::from_file_name(file_name) { + None => { + tracing::debug!( + target: LOG_TARGET, + "{} is not a recognized artifact. Removing", + path.display(), + ); + let _ = async_std::fs::remove_file(&path).await; + continue; + } + Some(artifact_id) => artifact_id, + }; + + // A sanity check so that we really can access the artifact through the artifact id. + if artifact_id.path(cache_path).is_file().await { + result.insert( + artifact_id, + ArtifactState::Prepared { + last_time_needed: now, + }, + ); + } else { + tracing::warn!( + target: LOG_TARGET, + "{} is not accessible by artifact_id {:?}", + cache_path.display(), + artifact_id, + ); + } + } + + Ok(result) +} + +#[cfg(test)] +mod tests { + use super::ArtifactId; + + #[test] + fn ensure_wasmtime_version() { + assert_eq!( + wasmtime_jit::VERSION, + "0.24.0", + "wasmtime version is updated. Check the prefix.", + ); + // If the version bump is significant, change `ArtifactId::PREFIX`. + // + // If in doubt bump it. This will lead to removal of the existing artifacts in the on-disk cache + // and recompilation. + } + + #[test] + fn from_file_name() { + assert!(ArtifactId::from_file_name("").is_none()); + assert!(ArtifactId::from_file_name("junk").is_none()); + + assert_eq!( + ArtifactId::from_file_name( + "wasmtime_1_0x0022800000000000000000000000000000000000000000000000000000000000" + ), + Some(ArtifactId::new( + hex_literal::hex![ + "0022800000000000000000000000000000000000000000000000000000000000" + ] + .into() + )), + ); + } +} diff --git a/node/core/pvf/src/error.rs b/node/core/pvf/src/error.rs new file mode 100644 index 000000000000..71377cdf3dc2 --- /dev/null +++ b/node/core/pvf/src/error.rs @@ -0,0 +1,56 @@ +// Copyright 2021 Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +/// A error raised during validation of the candidate. +#[derive(Debug, Clone)] +pub enum ValidationError { + /// The error was raised because the candidate is invalid. + InvalidCandidate(InvalidCandidate), + /// This error is raised due to inability to serve the request. + InternalError(String), +} + +/// A description of an error raised during executing a PVF and can be attributed to the combination +/// of the candidate [`polkadot_parachain::primitives::ValidationParams`] and the PVF. +#[derive(Debug, Clone)] +pub enum InvalidCandidate { + /// The failure is reported by the worker. The string contains the error message. + /// + /// This also includes the errors reported by the preparation pipeline. + WorkerReportedError(String), + /// The worker has died during validation of a candidate. That may fall in one of the following + /// categories, which we cannot distinguish programmatically: + /// + /// (a) Some sort of transient glitch caused the worker process to abort. An example would be that + /// the host machine ran out of free memory and the OOM killer started killing the processes, + /// and in order to save the parent it will "sacrifice child" first. + /// + /// (b) The candidate triggered a code path that has lead to the process death. For example, + /// the PVF found a way to consume unbounded amount of resources and then it either exceeded + /// an rlimit (if set) or, again, invited OOM killer. Another possibility is a bug in + /// wasmtime allowed the PVF to gain control over the execution worker. + /// + /// We attribute such an event to an invalid candidate in either case. + /// + /// The rationale for this is that a glitch may lead to unfair rejecting candidate by a single + /// validator. If the glitch is somewhat more persistant the validator will reject all candidate + /// thrown at it and hopefully the operator notices it by decreased reward performance of the + /// validator. On the other hand, if the worker died because of (b) we would have better chances + /// to stop the attack. + AmbigiousWorkerDeath, + /// PVF execution (compilation is not included) took more time than was allotted. + HardTimeout, +} diff --git a/parachain/test-parachains/tests/lib.rs b/node/core/pvf/src/execute/mod.rs similarity index 53% rename from parachain/test-parachains/tests/lib.rs rename to node/core/pvf/src/execute/mod.rs index aef9877ffad1..f1580f9d668e 100644 --- a/parachain/test-parachains/tests/lib.rs +++ b/node/core/pvf/src/execute/mod.rs @@ -1,4 +1,4 @@ -// Copyright 2019-2020 Parity Technologies (UK) Ltd. +// Copyright 2021 Parity Technologies (UK) Ltd. // This file is part of Polkadot. // Polkadot is free software: you can redistribute it and/or modify @@ -14,17 +14,14 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . -mod adder; -mod wasm_executor; +//! Execution part of the pipeline. +//! +//! The validation host [runs the queue][`start`] communicating with it by sending [`ToQueue`] +//! messages. The queue will spawn workers in new processes. Those processes should jump to +//! [`worker_entrypoint`]. -use parachain::wasm_executor::run_worker; +mod queue; +mod worker; -// This is not an actual test, but rather an entry point for out-of process WASM executor. -// When executing tests the executor spawns currently executing binary, which happens to be test binary. -// It then passes "validation_worker" on CLI effectivly making rust test executor to run this single test. -#[test] -fn validation_worker() { - if let Some(id) = std::env::args().find(|a| a.starts_with("/shmem_")) { - run_worker(&id, None).unwrap() - } -} +pub use queue::{ToQueue, start}; +pub use worker::worker_entrypoint; diff --git a/node/core/pvf/src/execute/queue.rs b/node/core/pvf/src/execute/queue.rs new file mode 100644 index 000000000000..98aab605affc --- /dev/null +++ b/node/core/pvf/src/execute/queue.rs @@ -0,0 +1,344 @@ +// Copyright 2021 Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +//! A queue that handles requests for PVF execution. + +use crate::{ + worker_common::{IdleWorker, WorkerHandle}, + host::ResultSender, + LOG_TARGET, InvalidCandidate, ValidationError, +}; +use super::worker::Outcome; +use std::{collections::VecDeque, fmt, time::Duration}; +use futures::{ + Future, FutureExt, + channel::mpsc, + future::BoxFuture, + stream::{FuturesUnordered, StreamExt as _}, +}; +use async_std::path::PathBuf; +use slotmap::HopSlotMap; + +slotmap::new_key_type! { struct Worker; } + +#[derive(Debug)] +pub enum ToQueue { + Enqueue { + artifact_path: PathBuf, + params: Vec, + result_tx: ResultSender, + }, +} + +struct ExecuteJob { + artifact_path: PathBuf, + params: Vec, + result_tx: ResultSender, +} + +struct WorkerData { + idle: Option, + handle: WorkerHandle, +} + +impl fmt::Debug for WorkerData { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "WorkerData(pid={})", self.handle.id()) + } +} + +struct Workers { + /// The registry of running workers. + running: HopSlotMap, + + /// The number of spawning but not yet spawned workers. + spawn_inflight: usize, + + /// The maximum number of workers queue can have at once. + capacity: usize, +} + +impl Workers { + fn can_afford_one_more(&self) -> bool { + self.spawn_inflight + self.running.len() < self.capacity + } + + fn find_available(&self) -> Option { + self.running + .iter() + .find_map(|d| if d.1.idle.is_some() { Some(d.0) } else { None }) + } + + /// Find the associated data by the worker token and extract it's [`IdleWorker`] token. + /// + /// Returns `None` if either worker is not recognized or idle token is absent. + fn claim_idle(&mut self, worker: Worker) -> Option { + self + .running + .get_mut(worker)? + .idle + .take() + } +} + +enum QueueEvent { + Spawn((IdleWorker, WorkerHandle)), + StartWork(Worker, Outcome, ResultSender), +} + +type Mux = FuturesUnordered>; + +struct Queue { + /// The receiver that receives messages to the pool. + to_queue_rx: mpsc::Receiver, + + program_path: PathBuf, + spawn_timeout: Duration, + + /// The queue of jobs that are waiting for a worker to pick up. + queue: VecDeque, + workers: Workers, + mux: Mux, +} + +impl Queue { + fn new( + program_path: PathBuf, + worker_capacity: usize, + spawn_timeout: Duration, + to_queue_rx: mpsc::Receiver, + ) -> Self { + Self { + program_path, + spawn_timeout, + to_queue_rx, + queue: VecDeque::new(), + mux: Mux::new(), + workers: Workers { + running: HopSlotMap::with_capacity_and_key(10), + spawn_inflight: 0, + capacity: worker_capacity, + }, + } + } + + async fn run(mut self) { + loop { + futures::select! { + to_queue = self.to_queue_rx.next() => { + if let Some(to_queue) = to_queue { + handle_to_queue(&mut self, to_queue); + } else { + break; + } + } + ev = self.mux.select_next_some() => handle_mux(&mut self, ev).await, + } + + purge_dead(&mut self.workers).await; + } + } +} + +async fn purge_dead(workers: &mut Workers) { + let mut to_remove = vec![]; + for (worker, data) in workers.running.iter_mut() { + if futures::poll!(&mut data.handle).is_ready() { + // a resolved future means that the worker has terminated. Weed it out. + to_remove.push(worker); + } + } + for w in to_remove { + let _ = workers.running.remove(w); + } +} + +fn handle_to_queue(queue: &mut Queue, to_queue: ToQueue) { + let ToQueue::Enqueue { + artifact_path, + params, + result_tx, + } = to_queue; + + let job = ExecuteJob { + artifact_path, + params, + result_tx, + }; + + if let Some(available) = queue.workers.find_available() { + assign(queue, available, job); + } else { + if queue.workers.can_afford_one_more() { + spawn_extra_worker(queue); + } + queue.queue.push_back(job); + } +} + +async fn handle_mux(queue: &mut Queue, event: QueueEvent) { + match event { + QueueEvent::Spawn((idle, handle)) => { + queue.workers.spawn_inflight -= 1; + + let worker = queue.workers.running.insert(WorkerData { + idle: Some(idle), + handle, + }); + + if let Some(job) = queue.queue.pop_front() { + assign(queue, worker, job); + } + } + QueueEvent::StartWork(worker, outcome, result_tx) => { + handle_job_finish(queue, worker, outcome, result_tx); + } + } +} + +/// If there are pending jobs in the queue, schedules the next of them onto the just freed up +/// worker. Otherwise, puts back into the available workers list. +fn handle_job_finish(queue: &mut Queue, worker: Worker, outcome: Outcome, result_tx: ResultSender) { + let (idle_worker, result) = match outcome { + Outcome::Ok { + result_descriptor, + duration_ms, + idle_worker, + } => { + // TODO: propagate the soft timeout + drop(duration_ms); + + (Some(idle_worker), Ok(result_descriptor)) + } + Outcome::InvalidCandidate { err, idle_worker } => ( + Some(idle_worker), + Err(ValidationError::InvalidCandidate( + InvalidCandidate::WorkerReportedError(err), + )), + ), + Outcome::InternalError { err, idle_worker } => ( + Some(idle_worker), + Err(ValidationError::InternalError(err)), + ), + Outcome::HardTimeout => ( + None, + Err(ValidationError::InvalidCandidate( + InvalidCandidate::HardTimeout, + )), + ), + Outcome::IoErr => ( + None, + Err(ValidationError::InvalidCandidate( + InvalidCandidate::AmbigiousWorkerDeath, + )), + ), + }; + + // First we send the result. It may fail due the other end of the channel being dropped, that's + // legitimate and we don't treat that as an error. + let _ = result_tx.send(result); + + // Then, we should deal with the worker: + // + // - if the `idle_worker` token was returned we should either schedule the next task or just put + // it back so that the next incoming job will be able to claim it + // + // - if the `idle_worker` token was consumed, all the metadata pertaining to that worker should + // be removed. + if let Some(idle_worker) = idle_worker { + if let Some(data) = queue.workers.running.get_mut(worker) { + data.idle = Some(idle_worker); + + if let Some(job) = queue.queue.pop_front() { + assign(queue, worker, job); + } + } + } else { + // Note it's possible that the worker was purged already by `purge_dead` + queue.workers.running.remove(worker); + + if !queue.queue.is_empty() { + // The worker has died and we still have work we have to do. Request an extra worker. + // + // That can potentially overshoot, but that should be OK. + spawn_extra_worker(queue); + } + } +} + +fn spawn_extra_worker(queue: &mut Queue) { + queue + .mux + .push(spawn_worker_task(queue.program_path.clone(), queue.spawn_timeout).boxed()); + queue.workers.spawn_inflight += 1; +} + +async fn spawn_worker_task(program_path: PathBuf, spawn_timeout: Duration) -> QueueEvent { + use futures_timer::Delay; + + loop { + match super::worker::spawn(&program_path, spawn_timeout).await { + Ok((idle, handle)) => break QueueEvent::Spawn((idle, handle)), + Err(err) => { + tracing::warn!( + target: LOG_TARGET, + "failed to spawn an execute worker: {:?}", + err, + ); + + // Assume that the failure intermittent and retry after a delay. + Delay::new(Duration::from_secs(3)).await; + } + } + } +} + +/// Ask the given worker to perform the given job. +/// +/// The worker must be running and idle. +fn assign(queue: &mut Queue, worker: Worker, job: ExecuteJob) { + let idle = queue + .workers + .claim_idle(worker) + .expect( + "this caller must supply a worker which is idle and running; + thus claim_idle cannot return None; + qed." + ); + queue.mux.push( + async move { + let outcome = super::worker::start_work(idle, job.artifact_path, job.params).await; + QueueEvent::StartWork(worker, outcome, job.result_tx) + } + .boxed(), + ); +} + +pub fn start( + program_path: PathBuf, + worker_capacity: usize, + spawn_timeout: Duration, +) -> (mpsc::Sender, impl Future) { + let (to_queue_tx, to_queue_rx) = mpsc::channel(20); + let run = Queue::new( + program_path, + worker_capacity, + spawn_timeout, + to_queue_rx, + ) + .run(); + (to_queue_tx, run) +} diff --git a/node/core/pvf/src/execute/worker.rs b/node/core/pvf/src/execute/worker.rs new file mode 100644 index 000000000000..e2d6f3f5623f --- /dev/null +++ b/node/core/pvf/src/execute/worker.rs @@ -0,0 +1,272 @@ +// Copyright 2021 Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +use crate::{ + artifacts::Artifact, + LOG_TARGET, + executor_intf::TaskExecutor, + worker_common::{ + IdleWorker, SpawnErr, WorkerHandle, bytes_to_path, framed_recv, framed_send, path_to_bytes, + spawn_with_program_path, worker_event_loop, + }, +}; +use std::time::{Duration, Instant}; +use async_std::{ + io, + os::unix::net::UnixStream, + path::{Path, PathBuf}, +}; +use futures::FutureExt; +use futures_timer::Delay; +use polkadot_parachain::primitives::ValidationResult; +use parity_scale_codec::{Encode, Decode}; + +const EXECUTION_TIMEOUT: Duration = Duration::from_secs(3); + +/// Spawns a new worker with the given program path that acts as the worker and the spawn timeout. +/// +/// The program should be able to handle ` execute-worker ` invocation. +pub async fn spawn( + program_path: &Path, + spawn_timeout: Duration, +) -> Result<(IdleWorker, WorkerHandle), SpawnErr> { + spawn_with_program_path( + "execute", + program_path, + &["execute-worker"], + spawn_timeout, + ) + .await +} + +/// Outcome of PVF execution. +pub enum Outcome { + /// PVF execution completed successfully and the result is returned. The worker is ready for + /// another job. + Ok { + result_descriptor: ValidationResult, + duration_ms: u64, + idle_worker: IdleWorker, + }, + /// The candidate validation failed. It may be for example because the preparation process + /// produced an error or the wasm execution triggered a trap. + InvalidCandidate { + err: String, + idle_worker: IdleWorker, + }, + /// An internal error happened during the validation. Such an error is most likely related to + /// some transient glitch. + InternalError { + err: String, + idle_worker: IdleWorker, + }, + /// The execution time exceeded the hard limit. The worker is terminated. + HardTimeout, + /// An I/O error happened during communication with the worker. This may mean that the worker + /// process already died. The token is not returned in any case. + IoErr, +} + +/// Given the idle token of a worker and parameters of work, communicates with the worker and +/// returns the outcome. +pub async fn start_work( + worker: IdleWorker, + artifact_path: PathBuf, + validation_params: Vec, +) -> Outcome { + let IdleWorker { mut stream, pid } = worker; + + tracing::debug!( + target: LOG_TARGET, + worker_pid = %pid, + "starting execute for {}", + artifact_path.display(), + ); + + if send_request(&mut stream, &artifact_path, &validation_params).await.is_err() { + return Outcome::IoErr; + } + + let response = futures::select! { + response = recv_response(&mut stream).fuse() => { + match response { + Err(_err) => return Outcome::IoErr, + Ok(response) => response, + } + }, + _ = Delay::new(EXECUTION_TIMEOUT).fuse() => return Outcome::HardTimeout, + }; + + match response { + Response::Ok { + result_descriptor, + duration_ms, + } => Outcome::Ok { + result_descriptor, + duration_ms, + idle_worker: IdleWorker { stream, pid }, + }, + Response::InvalidCandidate(err) => Outcome::InvalidCandidate { + err, + idle_worker: IdleWorker { stream, pid }, + }, + Response::InternalError(err) => Outcome::InternalError { + err, + idle_worker: IdleWorker { stream, pid }, + }, + } +} + +async fn send_request( + stream: &mut UnixStream, + artifact_path: &Path, + validation_params: &[u8], +) -> io::Result<()> { + framed_send(stream, path_to_bytes(artifact_path)).await?; + framed_send(stream, validation_params).await +} + +async fn recv_request(stream: &mut UnixStream) -> io::Result<(PathBuf, Vec)> { + let artifact_path = framed_recv(stream).await?; + let artifact_path = bytes_to_path(&artifact_path).ok_or_else(|| { + io::Error::new( + io::ErrorKind::Other, + "execute pvf recv_request: non utf-8 artifact path".to_string(), + ) + })?; + let params = framed_recv(stream).await?; + Ok((artifact_path, params)) +} + +async fn send_response(stream: &mut UnixStream, response: Response) -> io::Result<()> { + framed_send(stream, &response.encode()).await +} + +async fn recv_response(stream: &mut UnixStream) -> io::Result { + let response_bytes = framed_recv(stream).await?; + Response::decode(&mut &response_bytes[..]).map_err(|e| { + io::Error::new( + io::ErrorKind::Other, + format!("execute pvf recv_response: decode error: {:?}", e), + ) + }) +} + +#[derive(Encode, Decode)] +enum Response { + Ok { + result_descriptor: ValidationResult, + duration_ms: u64, + }, + InvalidCandidate(String), + InternalError(String), +} + +impl Response { + fn format_invalid(ctx: &'static str, msg: &str) -> Self { + if msg.is_empty() { + Self::InvalidCandidate(ctx.to_string()) + } else { + Self::InvalidCandidate(format!("{}: {}", ctx, msg)) + } + } +} + +/// The entrypoint that the spawned execute worker should start with. The socket_path specifies +/// the path to the socket used to communicate with the host. +pub fn worker_entrypoint(socket_path: &str) { + worker_event_loop("execute", socket_path, |mut stream| async move { + let executor = TaskExecutor::new().map_err(|e| { + io::Error::new( + io::ErrorKind::Other, + format!("cannot create task executor: {}", e), + ) + })?; + loop { + let (artifact_path, params) = recv_request(&mut stream).await?; + tracing::debug!( + target: LOG_TARGET, + worker_pid = %std::process::id(), + "worker: validating artifact {}", + artifact_path.display(), + ); + let response = validate_using_artifact(&artifact_path, ¶ms, &executor).await; + send_response(&mut stream, response).await?; + } + }); +} + +async fn validate_using_artifact( + artifact_path: &Path, + params: &[u8], + spawner: &TaskExecutor, +) -> Response { + let artifact_bytes = match async_std::fs::read(artifact_path).await { + Err(e) => { + return Response::InternalError(format!( + "failed to read the artifact at {}: {:?}", + artifact_path.display(), + e, + )) + } + Ok(b) => b, + }; + + let artifact = match Artifact::deserialize(&artifact_bytes) { + Err(e) => return Response::InternalError(format!("artifact deserialization: {:?}", e)), + Ok(a) => a, + }; + + let compiled_artifact = match &artifact { + Artifact::PrevalidationErr(msg) => { + return Response::format_invalid("prevalidation", msg); + } + Artifact::PreparationErr(msg) => { + return Response::format_invalid("preparation", msg); + } + Artifact::DidntMakeIt => { + return Response::format_invalid("preparation timeout", ""); + } + + Artifact::Compiled { compiled_artifact } => compiled_artifact, + }; + + let validation_started_at = Instant::now(); + let descriptor_bytes = + match crate::executor_intf::execute(compiled_artifact, params, spawner.clone()) { + Err(err) => { + return Response::format_invalid("execute", &err.to_string()); + } + Ok(d) => d, + }; + + let duration_ms = validation_started_at.elapsed().as_millis() as u64; + + let result_descriptor = match ValidationResult::decode(&mut &descriptor_bytes[..]) { + Err(err) => { + return Response::InvalidCandidate(format!( + "validation result decoding failed: {}", + err + )) + } + Ok(r) => r, + }; + + Response::Ok { + result_descriptor, + duration_ms, + } +} diff --git a/node/core/pvf/src/executor_intf.rs b/node/core/pvf/src/executor_intf.rs new file mode 100644 index 000000000000..5318589da074 --- /dev/null +++ b/node/core/pvf/src/executor_intf.rs @@ -0,0 +1,239 @@ +// Copyright 2021 Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +//! Interface to the Substrate Executor + +use std::any::{TypeId, Any}; +use sc_executor_common::{ + runtime_blob::RuntimeBlob, + wasm_runtime::{InvokeMethod, WasmModule as _}, +}; +use sc_executor_wasmtime::{Config, Semantics}; +use sp_core::{ + storage::{ChildInfo, TrackedStorageKey}, +}; +use sp_wasm_interface::HostFunctions as _; + +const CONFIG: Config = Config { + // TODO: Make sure we don't use more than 1GB: https://github.com/paritytech/polkadot/issues/699 + heap_pages: 1024, + allow_missing_func_imports: true, + cache_path: None, + semantics: Semantics { + fast_instance_reuse: false, + stack_depth_metering: false, + }, +}; + +/// Runs the prevaldation on the given code. Returns a [`RuntimeBlob`] if it succeeds. +pub fn prevalidate(code: &[u8]) -> Result { + let blob = RuntimeBlob::new(code)?; + // It's assumed this function will take care of any prevalidation logic + // that needs to be done. + // + // Do nothing for now. + Ok(blob) +} + +/// Runs preparation on the given runtime blob. If successful, it returns a serialized compiled +/// artifact which can then be used to pass into [`execute`]. +pub fn prepare(blob: RuntimeBlob) -> Result, sc_executor_common::error::WasmError> { + sc_executor_wasmtime::prepare_runtime_artifact(blob, &CONFIG.semantics) +} + +/// Executes the given PVF in the form of a compiled artifact and returns the result of execution +/// upon success. +pub fn execute( + compiled_artifact: &[u8], + params: &[u8], + spawner: impl sp_core::traits::SpawnNamed + 'static, +) -> Result, sc_executor_common::error::Error> { + let mut extensions = sp_externalities::Extensions::new(); + + extensions.register(sp_core::traits::TaskExecutorExt::new(spawner)); + + let mut ext = ValidationExternalities(extensions); + + sc_executor::with_externalities_safe(&mut ext, || { + let runtime = sc_executor_wasmtime::create_runtime( + sc_executor_wasmtime::CodeSupplyMode::Artifact { compiled_artifact }, + CONFIG, + HostFunctions::host_functions(), + )?; + runtime + .new_instance()? + .call(InvokeMethod::Export("validate_block"), params) + })? +} + +type HostFunctions = sp_io::SubstrateHostFunctions; + +/// The validation externalities that will panic on any storage related access. +struct ValidationExternalities(sp_externalities::Extensions); + +impl sp_externalities::Externalities for ValidationExternalities { + fn storage(&self, _: &[u8]) -> Option> { + panic!("storage: unsupported feature for parachain validation") + } + + fn storage_hash(&self, _: &[u8]) -> Option> { + panic!("storage_hash: unsupported feature for parachain validation") + } + + fn child_storage_hash(&self, _: &ChildInfo, _: &[u8]) -> Option> { + panic!("child_storage_hash: unsupported feature for parachain validation") + } + + fn child_storage(&self, _: &ChildInfo, _: &[u8]) -> Option> { + panic!("child_storage: unsupported feature for parachain validation") + } + + fn kill_child_storage(&mut self, _: &ChildInfo, _: Option) -> (bool, u32) { + panic!("kill_child_storage: unsupported feature for parachain validation") + } + + fn clear_prefix(&mut self, _: &[u8]) { + panic!("clear_prefix: unsupported feature for parachain validation") + } + + fn clear_child_prefix(&mut self, _: &ChildInfo, _: &[u8]) { + panic!("clear_child_prefix: unsupported feature for parachain validation") + } + + fn place_storage(&mut self, _: Vec, _: Option>) { + panic!("place_storage: unsupported feature for parachain validation") + } + + fn place_child_storage(&mut self, _: &ChildInfo, _: Vec, _: Option>) { + panic!("place_child_storage: unsupported feature for parachain validation") + } + + fn storage_root(&mut self) -> Vec { + panic!("storage_root: unsupported feature for parachain validation") + } + + fn child_storage_root(&mut self, _: &ChildInfo) -> Vec { + panic!("child_storage_root: unsupported feature for parachain validation") + } + + fn storage_changes_root(&mut self, _: &[u8]) -> Result>, ()> { + panic!("storage_changes_root: unsupported feature for parachain validation") + } + + fn next_child_storage_key(&self, _: &ChildInfo, _: &[u8]) -> Option> { + panic!("next_child_storage_key: unsupported feature for parachain validation") + } + + fn next_storage_key(&self, _: &[u8]) -> Option> { + panic!("next_storage_key: unsupported feature for parachain validation") + } + + fn storage_append(&mut self, _key: Vec, _value: Vec) { + panic!("storage_append: unsupported feature for parachain validation") + } + + fn storage_start_transaction(&mut self) { + panic!("storage_start_transaction: unsupported feature for parachain validation") + } + + fn storage_rollback_transaction(&mut self) -> Result<(), ()> { + panic!("storage_rollback_transaction: unsupported feature for parachain validation") + } + + fn storage_commit_transaction(&mut self) -> Result<(), ()> { + panic!("storage_commit_transaction: unsupported feature for parachain validation") + } + + fn wipe(&mut self) { + panic!("wipe: unsupported feature for parachain validation") + } + + fn commit(&mut self) { + panic!("commit: unsupported feature for parachain validation") + } + + fn read_write_count(&self) -> (u32, u32, u32, u32) { + panic!("read_write_count: unsupported feature for parachain validation") + } + + fn reset_read_write_count(&mut self) { + panic!("reset_read_write_count: unsupported feature for parachain validation") + } + + fn get_whitelist(&self) -> Vec { + panic!("get_whitelist: unsupported feature for parachain validation") + } + + fn set_whitelist(&mut self, _: Vec) { + panic!("set_whitelist: unsupported feature for parachain validation") + } + + fn set_offchain_storage(&mut self, _: &[u8], _: std::option::Option<&[u8]>) { + panic!("set_offchain_storage: unsupported feature for parachain validation") + } +} + +impl sp_externalities::ExtensionStore for ValidationExternalities { + fn extension_by_type_id(&mut self, type_id: TypeId) -> Option<&mut dyn Any> { + self.0.get_mut(type_id) + } + + fn register_extension_with_type_id( + &mut self, + type_id: TypeId, + extension: Box, + ) -> Result<(), sp_externalities::Error> { + self.0.register_with_type_id(type_id, extension) + } + + fn deregister_extension_by_type_id( + &mut self, + type_id: TypeId, + ) -> Result<(), sp_externalities::Error> { + if self.0.deregister(type_id) { + Ok(()) + } else { + Err(sp_externalities::Error::ExtensionIsNotRegistered(type_id)) + } + } +} + +/// An implementation of `SpawnNamed` on top of a futures' thread pool. +/// +/// This is a light handle meaning it will only clone the handle not create a new thread pool. +#[derive(Clone)] +pub(crate) struct TaskExecutor(futures::executor::ThreadPool); + +impl TaskExecutor { + pub(crate) fn new() -> Result { + futures::executor::ThreadPoolBuilder::new() + .pool_size(4) + .name_prefix("pvf-task-executor") + .create() + .map_err(|e| e.to_string()) + .map(Self) + } +} + +impl sp_core::traits::SpawnNamed for TaskExecutor { + fn spawn_blocking(&self, _: &'static str, future: futures::future::BoxFuture<'static, ()>) { + self.0.spawn_ok(future); + } + + fn spawn(&self, _: &'static str, future: futures::future::BoxFuture<'static, ()>) { + self.0.spawn_ok(future); + } +} diff --git a/node/core/pvf/src/host.rs b/node/core/pvf/src/host.rs new file mode 100644 index 000000000000..4197acccb3a3 --- /dev/null +++ b/node/core/pvf/src/host.rs @@ -0,0 +1,1039 @@ +// Copyright 2021 Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +//! Validation host - is the primary interface for this crate. It allows the clients to enqueue +//! jobs for PVF execution or preparation. +//! +//! The validation host is represented by a future/task that runs an event-loop and by a handle, +//! [`ValidationHost`], that allows communication with that event-loop. + +use crate::{ + Priority, Pvf, ValidationError, + artifacts::{Artifacts, ArtifactState, ArtifactId}, + execute, prepare, +}; +use std::{ + collections::HashMap, + time::{Duration, SystemTime}, +}; +use always_assert::never; +use async_std::{ + path::{Path, PathBuf}, +}; +use polkadot_parachain::primitives::ValidationResult; +use futures::{ + Future, FutureExt, SinkExt, StreamExt, + channel::{mpsc, oneshot}, +}; + +/// An alias to not spell the type for the oneshot sender for the PVF execution result. +pub(crate) type ResultSender = oneshot::Sender>; + +/// A handle to the async process serving the validation host requests. +#[derive(Clone)] +pub struct ValidationHost { + to_host_tx: mpsc::Sender, +} + +impl ValidationHost { + /// Execute PVF with the given code, params and priority. The result of execution will be sent + /// to the provided result sender. + /// + /// This is async to accomodate the fact a possibility of back-pressure. In vast majority of + /// situations this function should return immediatelly. + /// + /// Returns an error if the request cannot be sent to the validation host, i.e. if it shut down. + pub async fn execute_pvf( + &mut self, + pvf: Pvf, + params: Vec, + priority: Priority, + result_tx: ResultSender, + ) -> Result<(), String> { + self.to_host_tx + .send(ToHost::ExecutePvf { + pvf, + params, + priority, + result_tx, + }) + .await + .map_err(|_| "the inner loop hung up".to_string()) + } + + /// Sends a signal to the validation host requesting to prepare a list of the given PVFs. + /// + /// This is async to accomodate the fact a possibility of back-pressure. In vast majority of + /// situations this function should return immediatelly. + /// + /// Returns an error if the request cannot be sent to the validation host, i.e. if it shut down. + pub async fn heads_up(&mut self, active_pvfs: Vec) -> Result<(), String> { + self.to_host_tx + .send(ToHost::HeadsUp { active_pvfs }) + .await + .map_err(|_| "the inner loop hung up".to_string()) + } +} + +enum ToHost { + ExecutePvf { + pvf: Pvf, + params: Vec, + priority: Priority, + result_tx: ResultSender, + }, + HeadsUp { + active_pvfs: Vec, + }, +} + +/// Configuration for the validation host. +pub struct Config { + /// The root directory where the prepared artifacts can be stored. + pub cache_path: PathBuf, + /// The path to the program that can be used to spawn the prepare workers. + pub prepare_worker_program_path: PathBuf, + /// The time alloted for a prepare worker to spawn and report to the host. + pub prepare_worker_spawn_timeout: Duration, + /// The maximum number of workers that can be spawned in the prepare pool for tasks with the + /// priority below critical. + pub prepare_workers_soft_max_num: usize, + /// The absolute number of workers that can be spawned in the prepare pool. + pub prepare_workers_hard_max_num: usize, + /// The path to the program that can be used to spawn the execute workers. + pub execute_worker_program_path: PathBuf, + /// The time alloted for an execute worker to spawn and report to the host. + pub execute_worker_spawn_timeout: Duration, + /// The maximum number of execute workers that can run at the same time. + pub execute_workers_max_num: usize, +} + +impl Config { + /// Create a new instance of the configuration. + pub fn new(cache_path: std::path::PathBuf, program_path: std::path::PathBuf) -> Self { + // Do not contaminate the other parts of the codebase with the types from async_std. + let cache_path = PathBuf::from(cache_path); + let program_path = PathBuf::from(program_path); + + Self { + cache_path, + prepare_worker_program_path: program_path.clone(), + prepare_worker_spawn_timeout: Duration::from_secs(3), + prepare_workers_soft_max_num: 8, + prepare_workers_hard_max_num: 5, + execute_worker_program_path: program_path, + execute_worker_spawn_timeout: Duration::from_secs(3), + execute_workers_max_num: 5, + } + } +} + +/// Start the validation host. +/// +/// Returns a [handle][`ValidationHost`] to the started validation host and the future. The future +/// must be polled in order for validation host to function. +/// +/// The future should not return normally but if it does then that indicates an unrecoverable error. +/// In that case all pending requests will be cancelled, dropping the result senders and new ones +/// will be rejected. +pub fn start(config: Config) -> (ValidationHost, impl Future) { + let (to_host_tx, to_host_rx) = mpsc::channel(10); + + let validation_host = ValidationHost { to_host_tx }; + + let (to_prepare_pool, from_prepare_pool, run_prepare_pool) = prepare::start_pool( + config.prepare_worker_program_path.to_owned(), + config.prepare_worker_spawn_timeout, + ); + + let (to_prepare_queue_tx, from_prepare_queue_rx, run_prepare_queue) = prepare::start_queue( + config.prepare_workers_soft_max_num, + config.prepare_workers_hard_max_num, + config.cache_path.clone(), + to_prepare_pool, + from_prepare_pool, + ); + + let (to_execute_queue_tx, run_execute_queue) = execute::start( + config.execute_worker_program_path.to_owned(), + config.execute_workers_max_num, + config.execute_worker_spawn_timeout, + ); + + let (to_sweeper_tx, to_sweeper_rx) = mpsc::channel(100); + let run_sweeper = sweeper_task(to_sweeper_rx); + + let run = async move { + let artifacts = Artifacts::new(&config.cache_path).await; + + futures::pin_mut!( + run_prepare_queue, + run_prepare_pool, + run_execute_queue, + run_sweeper + ); + + run( + Inner { + cache_path: config.cache_path, + cleanup_pulse_interval: Duration::from_secs(3600), + artifact_ttl: Duration::from_secs(3600 * 24), + artifacts, + to_host_rx, + to_prepare_queue_tx, + from_prepare_queue_rx, + to_execute_queue_tx, + to_sweeper_tx, + awaiting_prepare: AwaitingPrepare::default(), + }, + run_prepare_pool, + run_prepare_queue, + run_execute_queue, + run_sweeper, + ) + .await + }; + + (validation_host, run) +} + +/// An execution request that should execute the PVF (known in the context) and send the results +/// to the given result sender. +#[derive(Debug)] +struct PendingExecutionRequest { + params: Vec, + result_tx: ResultSender, +} + +/// A mapping from an artifact ID which is in preparation state to the list of pending exeuction +/// requests that should be executed once the artifact's prepration is finished. +#[derive(Default)] +struct AwaitingPrepare(HashMap>); + +impl AwaitingPrepare { + fn add(&mut self, artifact_id: ArtifactId, params: Vec, result_tx: ResultSender) { + self.0 + .entry(artifact_id) + .or_default() + .push(PendingExecutionRequest { params, result_tx }); + } + + fn take(&mut self, artifact_id: &ArtifactId) -> Vec { + self.0.remove(artifact_id).unwrap_or_default() + } +} + +struct Inner { + cache_path: PathBuf, + cleanup_pulse_interval: Duration, + artifact_ttl: Duration, + artifacts: Artifacts, + + to_host_rx: mpsc::Receiver, + + to_prepare_queue_tx: mpsc::Sender, + from_prepare_queue_rx: mpsc::UnboundedReceiver, + + to_execute_queue_tx: mpsc::Sender, + to_sweeper_tx: mpsc::Sender, + + awaiting_prepare: AwaitingPrepare, +} + +#[derive(Debug)] +struct Fatal; + +async fn run( + Inner { + cache_path, + cleanup_pulse_interval, + artifact_ttl, + mut artifacts, + to_host_rx, + from_prepare_queue_rx, + mut to_prepare_queue_tx, + mut to_execute_queue_tx, + mut to_sweeper_tx, + mut awaiting_prepare, + }: Inner, + prepare_pool: impl Future + Unpin, + prepare_queue: impl Future + Unpin, + execute_queue: impl Future + Unpin, + sweeper: impl Future + Unpin, +) { + macro_rules! break_if_fatal { + ($expr:expr) => { + match $expr { + Err(Fatal) => break, + Ok(v) => v, + } + }; + } + + let cleanup_pulse = pulse_every(cleanup_pulse_interval).fuse(); + futures::pin_mut!(cleanup_pulse); + + let mut to_host_rx = to_host_rx.fuse(); + let mut from_prepare_queue_rx = from_prepare_queue_rx.fuse(); + + // Make sure that the task-futures are fused. + let mut prepare_queue = prepare_queue.fuse(); + let mut prepare_pool = prepare_pool.fuse(); + let mut execute_queue = execute_queue.fuse(); + let mut sweeper = sweeper.fuse(); + + loop { + // biased to make it behave deterministically for tests. + futures::select_biased! { + _ = prepare_queue => { + never!("prepare_pool: long-running task never concludes; qed"); + break; + }, + _ = prepare_pool => { + never!("prepare_pool: long-running task never concludes; qed"); + break; + }, + _ = execute_queue => { + never!("execute_queue: long-running task never concludes; qed"); + break; + }, + _ = sweeper => { + never!("sweeper: long-running task never concludes; qed"); + break; + }, + () = cleanup_pulse.select_next_some() => { + // `select_next_some` because we don't expect this to fail, but if it does, we + // still don't fail. The tradeoff is that the compiled cache will start growing + // in size. That is, however, rather a slow process and hopefully the operator + // will notice it. + + break_if_fatal!(handle_cleanup_pulse( + &cache_path, + &mut to_sweeper_tx, + &mut artifacts, + artifact_ttl, + ).await); + }, + to_host = to_host_rx.next() => { + let to_host = break_if_fatal!(to_host.ok_or(Fatal)); + + break_if_fatal!(handle_to_host( + &cache_path, + &mut artifacts, + &mut to_prepare_queue_tx, + &mut to_execute_queue_tx, + &mut awaiting_prepare, + to_host, + ) + .await); + }, + from_prepare_queue = from_prepare_queue_rx.next() => { + let prepare::FromQueue::Prepared(artifact_id) + = break_if_fatal!(from_prepare_queue.ok_or(Fatal)); + + // Note that preparation always succeeds. + // + // That's because the error conditions are written into the artifact and will be + // reported at the time of the execution. It potentially, but not necessarily, + // can be scheduled as a result of this function call, in case there are pending + // executions. + // + // We could be eager in terms of reporting and plumb the result from the prepartion + // worker but we don't for the sake of simplicity. + break_if_fatal!(handle_prepare_done( + &cache_path, + &mut artifacts, + &mut to_execute_queue_tx, + &mut awaiting_prepare, + artifact_id, + ).await); + }, + } + } +} + +async fn handle_to_host( + cache_path: &Path, + artifacts: &mut Artifacts, + prepare_queue: &mut mpsc::Sender, + execute_queue: &mut mpsc::Sender, + awaiting_prepare: &mut AwaitingPrepare, + to_host: ToHost, +) -> Result<(), Fatal> { + match to_host { + ToHost::ExecutePvf { + pvf, + params, + priority, + result_tx, + } => { + handle_execute_pvf( + cache_path, + artifacts, + prepare_queue, + execute_queue, + awaiting_prepare, + pvf, + params, + priority, + result_tx, + ) + .await?; + } + ToHost::HeadsUp { active_pvfs } => { + handle_heads_up(artifacts, prepare_queue, active_pvfs).await?; + } + } + + Ok(()) +} + +async fn handle_execute_pvf( + cache_path: &Path, + artifacts: &mut Artifacts, + prepare_queue: &mut mpsc::Sender, + execute_queue: &mut mpsc::Sender, + awaiting_prepare: &mut AwaitingPrepare, + pvf: Pvf, + params: Vec, + priority: Priority, + result_tx: ResultSender, +) -> Result<(), Fatal> { + let artifact_id = pvf.as_artifact_id(); + + if let Some(state) = artifacts.artifact_state_mut(&artifact_id) { + match state { + ArtifactState::Prepared { + ref mut last_time_needed, + } => { + *last_time_needed = SystemTime::now(); + + send_execute( + execute_queue, + execute::ToQueue::Enqueue { + artifact_path: artifact_id.path(cache_path), + params, + result_tx, + }, + ) + .await?; + } + ArtifactState::Preparing => { + send_prepare( + prepare_queue, + prepare::ToQueue::Amend { + priority, + artifact_id: artifact_id.clone(), + }, + ) + .await?; + + awaiting_prepare.add(artifact_id, params, result_tx); + } + } + } else { + // Artifact is unknown: register it and enqueue a job with the corresponding priority and + // + artifacts.insert_preparing(artifact_id.clone()); + send_prepare(prepare_queue, prepare::ToQueue::Enqueue { priority, pvf }).await?; + + awaiting_prepare.add(artifact_id, params, result_tx); + } + + return Ok(()); +} + +async fn handle_heads_up( + artifacts: &mut Artifacts, + prepare_queue: &mut mpsc::Sender, + active_pvfs: Vec, +) -> Result<(), Fatal> { + let now = SystemTime::now(); + + for active_pvf in active_pvfs { + let artifact_id = active_pvf.as_artifact_id(); + if let Some(state) = artifacts.artifact_state_mut(&artifact_id) { + match state { + ArtifactState::Prepared { + last_time_needed, .. + } => { + *last_time_needed = now; + } + ArtifactState::Preparing => { + // Already preparing. We don't need to send a priority amend either because + // it can't get any lower than the background. + } + } + } else { + // The artifact is unknown: register it and put a background job into the prepare queue. + artifacts.insert_preparing(artifact_id.clone()); + + send_prepare( + prepare_queue, + prepare::ToQueue::Enqueue { + priority: Priority::Background, + pvf: active_pvf, + }, + ) + .await?; + } + } + + Ok(()) +} + +async fn handle_prepare_done( + cache_path: &Path, + artifacts: &mut Artifacts, + execute_queue: &mut mpsc::Sender, + awaiting_prepare: &mut AwaitingPrepare, + artifact_id: ArtifactId, +) -> Result<(), Fatal> { + // Make some sanity checks and extract the current state. + let state = match artifacts.artifact_state_mut(&artifact_id) { + None => { + // before sending request to prepare, the artifact is inserted with `preparing` state; + // the requests are deduplicated for the same artifact id; + // there is only one possible state change: prepare is done; + // thus the artifact cannot be unknown, only preparing; + // qed. + never!("an unknown artifact was prepared: {:?}", artifact_id); + return Ok(()); + } + Some(ArtifactState::Prepared { .. }) => { + // before sending request to prepare, the artifact is inserted with `preparing` state; + // the requests are deduplicated for the same artifact id; + // there is only one possible state change: prepare is done; + // thus the artifact cannot be prepared, only preparing; + // qed. + never!("the artifact is already prepared: {:?}", artifact_id); + return Ok(()); + } + Some(state @ ArtifactState::Preparing) => state, + }; + + // It's finally time to dispatch all the execution requests that were waiting for this artifact + // to be prepared. + let artifact_path = artifact_id.path(&cache_path); + let pending_requests = awaiting_prepare.take(&artifact_id); + for PendingExecutionRequest { params, result_tx } in pending_requests { + if result_tx.is_canceled() { + // Preparation could've taken quite a bit of time and the requester may be not interested + // in execution anymore, in which case we just skip the request. + continue; + } + + send_execute( + execute_queue, + execute::ToQueue::Enqueue { + artifact_path: artifact_path.clone(), + params, + result_tx, + }, + ) + .await?; + } + + // Now consider the artifact prepared. + *state = ArtifactState::Prepared { + last_time_needed: SystemTime::now(), + }; + + Ok(()) +} + +async fn send_prepare( + prepare_queue: &mut mpsc::Sender, + to_queue: prepare::ToQueue, +) -> Result<(), Fatal> { + prepare_queue.send(to_queue).await.map_err(|_| Fatal) +} + +async fn send_execute( + execute_queue: &mut mpsc::Sender, + to_queue: execute::ToQueue, +) -> Result<(), Fatal> { + execute_queue.send(to_queue).await.map_err(|_| Fatal) +} + +async fn handle_cleanup_pulse( + cache_path: &Path, + sweeper_tx: &mut mpsc::Sender, + artifacts: &mut Artifacts, + artifact_ttl: Duration, +) -> Result<(), Fatal> { + let to_remove = artifacts.prune(artifact_ttl); + for artifact_id in to_remove { + let artifact_path = artifact_id.path(cache_path); + sweeper_tx.send(artifact_path).await.map_err(|_| Fatal)?; + } + + Ok(()) +} + +/// A simple task which sole purpose is to delete files thrown at it. +async fn sweeper_task(mut sweeper_rx: mpsc::Receiver) { + loop { + match sweeper_rx.next().await { + None => break, + Some(condemned) => { + let _ = async_std::fs::remove_file(condemned).await; + } + } + } +} + +/// A stream that yields a pulse continuously at a given interval. +fn pulse_every(interval: std::time::Duration) -> impl futures::Stream { + futures::stream::unfold(interval, { + |interval| async move { + futures_timer::Delay::new(interval).await; + Some(((), interval)) + } + }) + .map(|_| ()) +} + +#[cfg(test)] +mod tests { + use super::*; + use futures::future::BoxFuture; + use assert_matches::assert_matches; + + #[async_std::test] + async fn pulse_test() { + let pulse = pulse_every(Duration::from_millis(100)); + futures::pin_mut!(pulse); + + for _ in 0usize..5usize { + let start = std::time::Instant::now(); + let _ = pulse.next().await.unwrap(); + + let el = start.elapsed().as_millis(); + assert!(el > 50 && el < 150, "{}", el); + } + } + + /// Creates a new pvf which artifact id can be uniquely identified by the given number. + fn artifact_id(descriminator: u32) -> ArtifactId { + Pvf::from_discriminator(descriminator).as_artifact_id() + } + + fn artifact_path(descriminator: u32) -> PathBuf { + artifact_id(descriminator) + .path(&PathBuf::from(std::env::temp_dir())) + .to_owned() + } + + struct Builder { + cleanup_pulse_interval: Duration, + artifact_ttl: Duration, + artifacts: Artifacts, + } + + impl Builder { + fn default() -> Self { + Self { + // these are selected high to not interfere in tests in which pruning is irrelevant. + cleanup_pulse_interval: Duration::from_secs(3600), + artifact_ttl: Duration::from_secs(3600), + + artifacts: Artifacts::empty(), + } + } + + fn build(self) -> Test { + Test::new(self) + } + } + + struct Test { + to_host_tx: Option>, + + to_prepare_queue_rx: mpsc::Receiver, + from_prepare_queue_tx: mpsc::UnboundedSender, + to_execute_queue_rx: mpsc::Receiver, + to_sweeper_rx: mpsc::Receiver, + + run: BoxFuture<'static, ()>, + } + + impl Test { + fn new( + Builder { + cleanup_pulse_interval, + artifact_ttl, + artifacts, + }: Builder, + ) -> Self { + let cache_path = PathBuf::from(std::env::temp_dir()); + + let (to_host_tx, to_host_rx) = mpsc::channel(10); + let (to_prepare_queue_tx, to_prepare_queue_rx) = mpsc::channel(10); + let (from_prepare_queue_tx, from_prepare_queue_rx) = mpsc::unbounded(); + let (to_execute_queue_tx, to_execute_queue_rx) = mpsc::channel(10); + let (to_sweeper_tx, to_sweeper_rx) = mpsc::channel(10); + + let mk_dummy_loop = || std::future::pending().boxed(); + + let run = run( + Inner { + cache_path, + cleanup_pulse_interval, + artifact_ttl, + artifacts, + to_host_rx, + to_prepare_queue_tx, + from_prepare_queue_rx, + to_execute_queue_tx, + to_sweeper_tx, + awaiting_prepare: AwaitingPrepare::default(), + }, + mk_dummy_loop(), + mk_dummy_loop(), + mk_dummy_loop(), + mk_dummy_loop(), + ) + .boxed(); + + Self { + to_host_tx: Some(to_host_tx), + to_prepare_queue_rx, + from_prepare_queue_tx, + to_execute_queue_rx, + to_sweeper_rx, + run, + } + } + + fn host_handle(&mut self) -> ValidationHost { + let to_host_tx = self.to_host_tx.take().unwrap(); + ValidationHost { to_host_tx } + } + + async fn poll_and_recv_to_prepare_queue(&mut self) -> prepare::ToQueue { + let to_prepare_queue_rx = &mut self.to_prepare_queue_rx; + run_until( + &mut self.run, + async { to_prepare_queue_rx.next().await.unwrap() }.boxed(), + ) + .await + } + + async fn poll_and_recv_to_execute_queue(&mut self) -> execute::ToQueue { + let to_execute_queue_rx = &mut self.to_execute_queue_rx; + run_until( + &mut self.run, + async { to_execute_queue_rx.next().await.unwrap() }.boxed(), + ) + .await + } + + async fn poll_ensure_to_execute_queue_is_empty(&mut self) { + use futures_timer::Delay; + + let to_execute_queue_rx = &mut self.to_execute_queue_rx; + run_until( + &mut self.run, + async { + futures::select! { + _ = Delay::new(Duration::from_millis(500)).fuse() => (), + _ = to_execute_queue_rx.next().fuse() => { + panic!("the execute queue supposed to be empty") + } + } + } + .boxed(), + ) + .await + } + + async fn poll_ensure_to_sweeper_is_empty(&mut self) { + use futures_timer::Delay; + + let to_sweeper_rx = &mut self.to_sweeper_rx; + run_until( + &mut self.run, + async { + futures::select! { + _ = Delay::new(Duration::from_millis(500)).fuse() => (), + msg = to_sweeper_rx.next().fuse() => { + panic!("the sweeper supposed to be empty, but received: {:?}", msg) + } + } + } + .boxed(), + ) + .await + } + } + + async fn run_until( + task: &mut (impl Future + Unpin), + mut fut: (impl Future + Unpin), + ) -> R { + use std::task::Poll; + + let start = std::time::Instant::now(); + let fut = &mut fut; + loop { + if start.elapsed() > std::time::Duration::from_secs(2) { + // We expect that this will take only a couple of iterations and thus to take way + // less than a second. + panic!("timeout"); + } + + if let Poll::Ready(r) = futures::poll!(&mut *fut) { + break r; + } + + if futures::poll!(&mut *task).is_ready() { + panic!() + } + } + } + + #[async_std::test] + async fn shutdown_on_handle_drop() { + let test = Builder::default().build(); + + let join_handle = async_std::task::spawn(test.run); + + // Dropping the handle will lead to conclusion of the read part and thus will make the event + // loop to stop, which in turn will resolve the join handle. + drop(test.to_host_tx); + join_handle.await; + } + + #[async_std::test] + async fn pruning() { + let mock_now = SystemTime::now() - Duration::from_millis(1000); + + let mut builder = Builder::default(); + builder.cleanup_pulse_interval = Duration::from_millis(100); + builder.artifact_ttl = Duration::from_millis(500); + builder.artifacts.insert_prepared(artifact_id(1), mock_now); + builder.artifacts.insert_prepared(artifact_id(2), mock_now); + let mut test = builder.build(); + let mut host = test.host_handle(); + + host.heads_up(vec![Pvf::from_discriminator(1)]) + .await + .unwrap(); + + let to_sweeper_rx = &mut test.to_sweeper_rx; + run_until( + &mut test.run, + async { + assert_eq!(to_sweeper_rx.next().await.unwrap(), artifact_path(2)); + } + .boxed(), + ) + .await; + + // Extend TTL for the first artifact and make sure we don't receive another file removal + // request. + host.heads_up(vec![Pvf::from_discriminator(1)]) + .await + .unwrap(); + test.poll_ensure_to_sweeper_is_empty().await; + } + + #[async_std::test] + async fn amending_priority() { + let mut test = Builder::default().build(); + let mut host = test.host_handle(); + + host.heads_up(vec![Pvf::from_discriminator(1)]) + .await + .unwrap(); + + // Run until we receive a prepare request. + let prepare_q_rx = &mut test.to_prepare_queue_rx; + run_until( + &mut test.run, + async { + assert_matches!( + prepare_q_rx.next().await.unwrap(), + prepare::ToQueue::Enqueue { .. } + ); + } + .boxed(), + ) + .await; + + let (result_tx, _result_rx) = oneshot::channel(); + host.execute_pvf( + Pvf::from_discriminator(1), + vec![], + Priority::Critical, + result_tx, + ) + .await + .unwrap(); + + run_until( + &mut test.run, + async { + assert_matches!( + prepare_q_rx.next().await.unwrap(), + prepare::ToQueue::Amend { .. } + ); + } + .boxed(), + ) + .await; + } + + #[async_std::test] + async fn execute_pvf_requests() { + use crate::error::InvalidCandidate; + + let mut test = Builder::default().build(); + let mut host = test.host_handle(); + + let (result_tx, result_rx_pvf_1_1) = oneshot::channel(); + host.execute_pvf( + Pvf::from_discriminator(1), + b"pvf1".to_vec(), + Priority::Normal, + result_tx, + ) + .await + .unwrap(); + + let (result_tx, result_rx_pvf_1_2) = oneshot::channel(); + host.execute_pvf( + Pvf::from_discriminator(1), + b"pvf1".to_vec(), + Priority::Critical, + result_tx, + ) + .await + .unwrap(); + + let (result_tx, result_rx_pvf_2) = oneshot::channel(); + host.execute_pvf( + Pvf::from_discriminator(2), + b"pvf2".to_vec(), + Priority::Normal, + result_tx, + ) + .await + .unwrap(); + + assert_matches!( + test.poll_and_recv_to_prepare_queue().await, + prepare::ToQueue::Enqueue { .. } + ); + assert_matches!( + test.poll_and_recv_to_prepare_queue().await, + prepare::ToQueue::Amend { .. } + ); + assert_matches!( + test.poll_and_recv_to_prepare_queue().await, + prepare::ToQueue::Enqueue { .. } + ); + + test.from_prepare_queue_tx + .send(prepare::FromQueue::Prepared(artifact_id(1))) + .await + .unwrap(); + let result_tx_pvf_1_1 = assert_matches!( + test.poll_and_recv_to_execute_queue().await, + execute::ToQueue::Enqueue { result_tx, .. } => result_tx + ); + let result_tx_pvf_1_2 = assert_matches!( + test.poll_and_recv_to_execute_queue().await, + execute::ToQueue::Enqueue { result_tx, .. } => result_tx + ); + + test.from_prepare_queue_tx + .send(prepare::FromQueue::Prepared(artifact_id(2))) + .await + .unwrap(); + let result_tx_pvf_2 = assert_matches!( + test.poll_and_recv_to_execute_queue().await, + execute::ToQueue::Enqueue { result_tx, .. } => result_tx + ); + + result_tx_pvf_1_1 + .send(Err(ValidationError::InvalidCandidate( + InvalidCandidate::AmbigiousWorkerDeath, + ))) + .unwrap(); + assert_matches!( + result_rx_pvf_1_1.now_or_never().unwrap().unwrap(), + Err(ValidationError::InvalidCandidate( + InvalidCandidate::AmbigiousWorkerDeath, + )) + ); + + result_tx_pvf_1_2 + .send(Err(ValidationError::InvalidCandidate( + InvalidCandidate::AmbigiousWorkerDeath, + ))) + .unwrap(); + assert_matches!( + result_rx_pvf_1_2.now_or_never().unwrap().unwrap(), + Err(ValidationError::InvalidCandidate( + InvalidCandidate::AmbigiousWorkerDeath, + )) + ); + + result_tx_pvf_2 + .send(Err(ValidationError::InvalidCandidate( + InvalidCandidate::AmbigiousWorkerDeath, + ))) + .unwrap(); + assert_matches!( + result_rx_pvf_2.now_or_never().unwrap().unwrap(), + Err(ValidationError::InvalidCandidate( + InvalidCandidate::AmbigiousWorkerDeath, + )) + ); + } + + #[async_std::test] + async fn cancellation() { + let mut test = Builder::default().build(); + let mut host = test.host_handle(); + + let (result_tx, result_rx) = oneshot::channel(); + host.execute_pvf( + Pvf::from_discriminator(1), + b"pvf1".to_vec(), + Priority::Normal, + result_tx, + ) + .await + .unwrap(); + + assert_matches!( + test.poll_and_recv_to_prepare_queue().await, + prepare::ToQueue::Enqueue { .. } + ); + + test.from_prepare_queue_tx + .send(prepare::FromQueue::Prepared(artifact_id(1))) + .await + .unwrap(); + + drop(result_rx); + + test.poll_ensure_to_execute_queue_is_empty().await; + } +} diff --git a/node/core/pvf/src/lib.rs b/node/core/pvf/src/lib.rs new file mode 100644 index 000000000000..d29887edba2b --- /dev/null +++ b/node/core/pvf/src/lib.rs @@ -0,0 +1,100 @@ +// Copyright 2021 Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +#![warn(missing_docs)] + +//! A crate that implements PVF validation host. +//! +//! This crate provides a simple API. You first [`start`] the validation host, which gives you the +//! [handle][`ValidationHost`] and the future you need to poll. +//! +//! Then using the handle the client can send two types of requests: +//! +//! (a) PVF execution. This accepts the PVF [params][`polkadot_parachain::primitives::ValidationParams`] +//! and the PVF [code][`Pvf`], prepares (verifies and compiles) the code, and then executes PVF +//! with the params. +//! +//! (b) Heads up. This request allows to signal that the given PVF may be needed soon and that it +//! should be prepared for execution. +//! +//! The preparation results are cached for some time after they either used or was signalled in heads up. +//! All requests that depends on preparation of the same PVF are bundled together and will be executed +//! as soon as the artifact is prepared. +//! +//! # Priority +//! +//! PVF execution requests can specify the [priority][`Priority`] with which the given request should +//! be handled. Different priority levels have different effects. This is discussed below. +//! +//! Preparation started by a heads up signal always starts in with the background priority. If there +//! is already a request for that PVF preparation under way the priority is inherited. If after heads +//! up, a new PVF execution request comes in with a higher priority, then the original task's priority +//! will be adjusted to match the new one if it's larger. +//! +//! Priority can never go down, only up. +//! +//! # Under the hood +//! +//! Under the hood, the validation host is built using a bunch of communicating processes, not +//! dissimilar to actors. Each of such "processes" is a future task that contains an event loop that +//! processes incoming messages, potentially delegating sub-tasks to other "processes". +//! +//! Two of these processes are queues. The first one is for preparation jobs and the second one is for +//! execution. Both of the queues are backed by separate pools of workers of different kind. +//! +//! Preparation workers handle preparation requests by preverifying and instrumenting PVF wasm code, +//! and then passing it into the compiler, to prepare the artifact. +//! +//! Artifact is a final product of preparation. If the preparation succeeded, then the artifact will +//! contain the compiled code usable for quick execution by a worker later on. +//! +//! If the preparation failed, then the worker will still write the artifact with the error message. +//! We save the artifact with the error so that we don't try to prepare the artifacts that are broken +//! repeatedly. +//! +//! The artifact is saved on disk and is also tracked by an in memory table. This in memory table +//! doesn't contain the artifact contents though, only a flag that the given artifact is compiled. +//! +//! The execute workers will be fed by the requests from the execution queue, which is basically a +//! combination of a path to the compiled artifact and the +//! [params][`polkadot_parachain::primitives::ValidationParams`]. +//! +//! Each fixed interval of time a pruning task will run. This task will remove all artifacts that +//! weren't used or received a heads up signal for a while. + +mod artifacts; +mod error; +mod execute; +mod executor_intf; +mod host; +mod prepare; +mod priority; +mod pvf; +mod worker_common; + +#[doc(hidden)] +pub mod testing; + +pub use error::{ValidationError, InvalidCandidate}; +pub use priority::Priority; +pub use pvf::Pvf; + +pub use host::{start, Config, ValidationHost}; + +pub use execute::worker_entrypoint as execute_worker_entrypoint; +pub use prepare::worker_entrypoint as prepare_worker_entrypoint; + +const LOG_TARGET: &str = "parachain::pvf"; diff --git a/node/core/pvf/src/prepare/mod.rs b/node/core/pvf/src/prepare/mod.rs new file mode 100644 index 000000000000..080dd069e29d --- /dev/null +++ b/node/core/pvf/src/prepare/mod.rs @@ -0,0 +1,31 @@ +// Copyright 2021 Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +//! Preparation part of pipeline +//! +//! The validation host spins up two processes: the queue (by running [`start_queue`]) and the pool +//! (by running [`start_pool`]). +//! +//! The pool will spawn workers in new processes and those should execute pass control to +//! [`worker_entrypoint`]. + +mod pool; +mod queue; +mod worker; + +pub use queue::{ToQueue, FromQueue, start as start_queue}; +pub use pool::start as start_pool; +pub use worker::worker_entrypoint; diff --git a/node/core/pvf/src/prepare/pool.rs b/node/core/pvf/src/prepare/pool.rs new file mode 100644 index 000000000000..1e3a4eb8ee7d --- /dev/null +++ b/node/core/pvf/src/prepare/pool.rs @@ -0,0 +1,336 @@ +// Copyright 2021 Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +use crate::{ + worker_common::{IdleWorker, WorkerHandle}, + LOG_TARGET, +}; +use super::{ + worker::{self, Outcome}, +}; +use std::{fmt, sync::Arc, task::Poll, time::Duration}; +use async_std::path::{Path, PathBuf}; +use futures::{ + Future, FutureExt, StreamExt, channel::mpsc, future::BoxFuture, stream::FuturesUnordered, +}; +use slotmap::HopSlotMap; +use assert_matches::assert_matches; +use always_assert::never; + +slotmap::new_key_type! { pub struct Worker; } + +/// Messages that the pool handles. +#[derive(Debug, PartialEq, Eq)] +pub enum ToPool { + /// Request a new worker to spawn. + /// + /// This request won't fail in case if the worker cannot be created. Instead, we consider + /// the failures transient and we try to spawn a worker after a delay. + /// + /// [`FromPool::Spawned`] will be returned as soon as the worker is spawned. + /// + /// The client should anticipate a [`FromPool::Rip`] message, in case the spawned worker was + /// stopped for some reason. + Spawn, + + /// Kill the given worker. No-op if the given worker is not running. + /// + /// [`FromPool::Rip`] won't be sent in this case. However, the client should be prepared to + /// receive [`FromPool::Rip`] nonetheless, since the worker may be have been ripped before + /// this message is processed. + Kill(Worker), + + /// If the given worker was started with the background priority, then it will be raised up to + /// normal priority. Otherwise, it's no-op. + BumpPriority(Worker), + + /// Request the given worker to start working on the given code. + /// + /// Once the job either succeeded or failed, a [`FromPool::Concluded`] message will be sent back. + /// + /// This should not be sent again until the concluded message is received. + StartWork { + worker: Worker, + code: Arc>, + artifact_path: PathBuf, + background_priority: bool, + }, +} + +/// A message sent from pool to its client. +#[derive(Debug)] +pub enum FromPool { + /// The given worker was just spawned and is ready to be used. + Spawned(Worker), + + /// The given worker either succeeded or failed the given job. Under any circumstances the + /// artifact file has been written. The bool says whether the worker ripped. + Concluded(Worker, bool), + + /// The given worker ceased to exist. + Rip(Worker), +} + +struct WorkerData { + idle: Option, + handle: WorkerHandle, +} + +impl fmt::Debug for WorkerData { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "WorkerData(pid={})", self.handle.id()) + } +} + +enum PoolEvent { + Spawn(IdleWorker, WorkerHandle), + StartWork(Worker, Outcome), +} + +type Mux = FuturesUnordered>; + +struct Pool { + program_path: PathBuf, + spawn_timeout: Duration, + to_pool: mpsc::Receiver, + from_pool: mpsc::UnboundedSender, + spawned: HopSlotMap, + mux: Mux, +} + +/// A fatal error that warrants stopping the event loop of the pool. +struct Fatal; + +async fn run( + Pool { + program_path, + spawn_timeout, + to_pool, + mut from_pool, + mut spawned, + mut mux, + }: Pool, +) { + macro_rules! break_if_fatal { + ($expr:expr) => { + match $expr { + Err(Fatal) => break, + Ok(v) => v, + } + }; + } + + let mut to_pool = to_pool.fuse(); + + loop { + futures::select! { + to_pool = to_pool.next() => { + let to_pool = break_if_fatal!(to_pool.ok_or(Fatal)); + handle_to_pool( + &program_path, + spawn_timeout, + &mut spawned, + &mut mux, + to_pool, + ) + } + ev = mux.select_next_some() => break_if_fatal!(handle_mux(&mut from_pool, &mut spawned, ev)), + } + + break_if_fatal!(purge_dead(&mut from_pool, &mut spawned).await); + } +} + +async fn purge_dead( + from_pool: &mut mpsc::UnboundedSender, + spawned: &mut HopSlotMap, +) -> Result<(), Fatal> { + let mut to_remove = vec![]; + for (worker, data) in spawned.iter_mut() { + if data.idle.is_none() { + // The idle token is missing, meaning this worker is now occupied: skip it. This is + // because the worker process is observed by the work task and should it reach the + // deadline or be terminated it will be handled by the corresponding mux event. + continue; + } + + if let Poll::Ready(()) = futures::poll!(&mut data.handle) { + // a resolved future means that the worker has terminated. Weed it out. + to_remove.push(worker); + } + } + for w in to_remove { + let _ = spawned.remove(w); + reply(from_pool, FromPool::Rip(w))?; + } + Ok(()) +} + +fn handle_to_pool( + program_path: &Path, + spawn_timeout: Duration, + spawned: &mut HopSlotMap, + mux: &mut Mux, + to_pool: ToPool, +) { + match to_pool { + ToPool::Spawn => { + mux.push(spawn_worker_task(program_path.to_owned(), spawn_timeout).boxed()); + } + ToPool::StartWork { + worker, + code, + artifact_path, + background_priority, + } => { + if let Some(data) = spawned.get_mut(worker) { + if let Some(idle) = data.idle.take() { + mux.push( + start_work_task(worker, idle, code, artifact_path, background_priority) + .boxed(), + ); + } else { + // idle token is present after spawn and after a job is concluded; + // the precondition for `StartWork` is it should be sent only if all previous work + // items concluded; + // thus idle token is Some; + // qed. + never!("unexpected abscence of the idle token in prepare pool"); + } + } else { + // That's a relatively normal situation since the queue may send `start_work` and + // before receiving it the pool would report that the worker died. + } + } + ToPool::Kill(worker) => { + // It may be absent if it were previously already removed by `purge_dead`. + let _ = spawned.remove(worker); + } + ToPool::BumpPriority(worker) => { + if let Some(data) = spawned.get(worker) { + worker::bump_priority(&data.handle); + } + } + } +} + +async fn spawn_worker_task(program_path: PathBuf, spawn_timeout: Duration) -> PoolEvent { + use futures_timer::Delay; + + loop { + match worker::spawn(&program_path, spawn_timeout).await { + Ok((idle, handle)) => break PoolEvent::Spawn(idle, handle), + Err(err) => { + tracing::warn!( + target: LOG_TARGET, + "failed to spawn a prepare worker: {:?}", + err, + ); + + // Assume that the failure intermittent and retry after a delay. + Delay::new(Duration::from_secs(3)).await; + } + } + } +} + +async fn start_work_task( + worker: Worker, + idle: IdleWorker, + code: Arc>, + artifact_path: PathBuf, + background_priority: bool, +) -> PoolEvent { + let outcome = worker::start_work(idle, code, artifact_path, background_priority).await; + PoolEvent::StartWork(worker, outcome) +} + +fn handle_mux( + from_pool: &mut mpsc::UnboundedSender, + spawned: &mut HopSlotMap, + event: PoolEvent, +) -> Result<(), Fatal> { + match event { + PoolEvent::Spawn(idle, handle) => { + let worker = spawned.insert(WorkerData { + idle: Some(idle), + handle, + }); + + reply(from_pool, FromPool::Spawned(worker))?; + + Ok(()) + } + PoolEvent::StartWork(worker, outcome) => { + match outcome { + Outcome::Concluded(idle) => { + let data = match spawned.get_mut(worker) { + None => { + // Perhaps the worker was killed meanwhile and the result is no longer + // relevant. + return Ok(()); + } + Some(data) => data, + }; + + // We just replace the idle worker that was loaned from this option during + // the work starting. + let old = data.idle.replace(idle); + assert_matches!(old, None, "attempt to overwrite an idle worker"); + + reply(from_pool, FromPool::Concluded(worker, false))?; + + Ok(()) + } + Outcome::DidntMakeIt => { + if let Some(_data) = spawned.remove(worker) { + reply(from_pool, FromPool::Concluded(worker, true))?; + } + + Ok(()) + } + } + } + } +} + +fn reply(from_pool: &mut mpsc::UnboundedSender, m: FromPool) -> Result<(), Fatal> { + from_pool.unbounded_send(m).map_err(|_| Fatal) +} + +/// Spins up the pool and returns the future that should be polled to make the pool functional. +pub fn start( + program_path: PathBuf, + spawn_timeout: Duration, +) -> ( + mpsc::Sender, + mpsc::UnboundedReceiver, + impl Future, +) { + let (to_pool_tx, to_pool_rx) = mpsc::channel(10); + let (from_pool_tx, from_pool_rx) = mpsc::unbounded(); + + let run = run(Pool { + program_path, + spawn_timeout, + to_pool: to_pool_rx, + from_pool: from_pool_tx, + spawned: HopSlotMap::with_capacity_and_key(20), + mux: Mux::new(), + }); + + (to_pool_tx, from_pool_rx, run) +} diff --git a/node/core/pvf/src/prepare/queue.rs b/node/core/pvf/src/prepare/queue.rs new file mode 100644 index 000000000000..b81a47ee9918 --- /dev/null +++ b/node/core/pvf/src/prepare/queue.rs @@ -0,0 +1,894 @@ +// Copyright 2021 Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +//! A queue that handles requests for PVF preparation. + +use super::{ + pool::{self, Worker}, +}; +use crate::{LOG_TARGET, Priority, Pvf, artifacts::ArtifactId}; +use futures::{Future, SinkExt, channel::mpsc, stream::StreamExt as _}; +use std::collections::{HashMap, VecDeque}; +use async_std::path::PathBuf; +use always_assert::{always, never}; + +/// A request to pool. +#[derive(Debug)] +pub enum ToQueue { + /// This schedules preparation of the given PVF. + /// + /// Note that it is incorrect to enqueue the same PVF again without first receiving the + /// [`FromQueue::Prepared`] response. In case there is a need to bump the priority, use + /// [`ToQueue::Amend`]. + Enqueue { priority: Priority, pvf: Pvf }, + /// Amends the priority for the given [`ArtifactId`] if it is running. If it's not, then it's noop. + Amend { + priority: Priority, + artifact_id: ArtifactId, + }, +} + +/// A response from queue. +#[derive(Debug, PartialEq, Eq)] +pub enum FromQueue { + Prepared(ArtifactId), +} + +#[derive(Default)] +struct Limits { + /// The maximum number of workers this pool can ever host. This is expected to be a small + /// number, e.g. within a dozen. + hard_capacity: usize, + + /// The number of workers we want aim to have. If there is a critical job and we are already + /// at `soft_capacity`, we are allowed to grow up to `hard_capacity`. Thus this should be equal + /// or smaller than `hard_capacity`. + soft_capacity: usize, +} + +impl Limits { + /// Returns `true` if the queue is allowed to request one more worker. + fn can_afford_one_more(&self, spawned_num: usize, critical: bool) -> bool { + let cap = if critical { + self.hard_capacity + } else { + self.soft_capacity + }; + spawned_num < cap + } + + /// Offer the worker back to the pool. The passed worker ID must be considered unusable unless + /// it wasn't taken by the pool, in which case it will be returned as `Some`. + fn should_cull(&mut self, spawned_num: usize) -> bool { + spawned_num > self.soft_capacity + } +} + +slotmap::new_key_type! { pub struct Job; } + +struct JobData { + /// The priority of this job. Can be bumped. + priority: Priority, + pvf: Pvf, + worker: Option, +} + +#[derive(Default)] +struct WorkerData { + job: Option, +} + +impl WorkerData { + fn is_idle(&self) -> bool { + self.job.is_none() + } +} + +/// A queue structured like this is prone to starving, however, we don't care that much since we expect +/// there is going to be a limited number of critical jobs and we don't really care if background starve. +#[derive(Default)] +struct Unscheduled { + background: VecDeque, + normal: VecDeque, + critical: VecDeque, +} + +impl Unscheduled { + fn queue_mut(&mut self, prio: Priority) -> &mut VecDeque { + match prio { + Priority::Background => &mut self.background, + Priority::Normal => &mut self.normal, + Priority::Critical => &mut self.critical, + } + } + + fn add(&mut self, prio: Priority, job: Job) { + self.queue_mut(prio).push_back(job); + } + + fn readd(&mut self, prio: Priority, job: Job) { + self.queue_mut(prio).push_front(job); + } + + fn is_empty(&self) -> bool { + self.background.is_empty() && self.normal.is_empty() && self.critical.is_empty() + } + + fn next(&mut self) -> Option { + let mut check = |prio: Priority| self.queue_mut(prio).pop_front(); + check(Priority::Critical) + .or_else(|| check(Priority::Normal)) + .or_else(|| check(Priority::Background)) + } +} + +struct Queue { + to_queue_rx: mpsc::Receiver, + from_queue_tx: mpsc::UnboundedSender, + + to_pool_tx: mpsc::Sender, + from_pool_rx: mpsc::UnboundedReceiver, + + cache_path: PathBuf, + limits: Limits, + + jobs: slotmap::SlotMap, + + /// A mapping from artifact id to a job. + artifact_id_to_job: HashMap, + /// The registry of all workers. + workers: slotmap::SparseSecondaryMap, + /// The number of workers requested to spawn but not yet spawned. + spawn_inflight: usize, + + /// The jobs that are not yet scheduled. These are waiting until the next `poll` where they are + /// processed all at once. + unscheduled: Unscheduled, +} + +/// A fatal error that warrants stopping the queue. +struct Fatal; + +impl Queue { + fn new( + soft_capacity: usize, + hard_capacity: usize, + cache_path: PathBuf, + to_queue_rx: mpsc::Receiver, + from_queue_tx: mpsc::UnboundedSender, + to_pool_tx: mpsc::Sender, + from_pool_rx: mpsc::UnboundedReceiver, + ) -> Self { + Self { + to_queue_rx, + from_queue_tx, + to_pool_tx, + from_pool_rx, + cache_path, + spawn_inflight: 0, + limits: Limits { + hard_capacity, + soft_capacity, + }, + jobs: slotmap::SlotMap::with_key(), + unscheduled: Unscheduled::default(), + artifact_id_to_job: HashMap::new(), + workers: slotmap::SparseSecondaryMap::new(), + } + } + + async fn run(mut self) { + macro_rules! break_if_fatal { + ($expr:expr) => { + if let Err(Fatal) = $expr { + break; + } + }; + } + + loop { + // biased to make it behave deterministically for tests. + futures::select_biased! { + to_queue = self.to_queue_rx.select_next_some() => + break_if_fatal!(handle_to_queue(&mut self, to_queue).await), + from_pool = self.from_pool_rx.select_next_some() => + break_if_fatal!(handle_from_pool(&mut self, from_pool).await), + } + } + } +} + +async fn handle_to_queue(queue: &mut Queue, to_queue: ToQueue) -> Result<(), Fatal> { + match to_queue { + ToQueue::Enqueue { priority, pvf } => { + handle_enqueue(queue, priority, pvf).await?; + } + ToQueue::Amend { + priority, + artifact_id, + } => { + handle_amend(queue, priority, artifact_id).await?; + } + } + Ok(()) +} + +async fn handle_enqueue(queue: &mut Queue, priority: Priority, pvf: Pvf) -> Result<(), Fatal> { + let artifact_id = pvf.as_artifact_id(); + if never!( + queue.artifact_id_to_job.contains_key(&artifact_id), + "second Enqueue sent for a known artifact" + ) { + // This function is called in response to a `Enqueue` message; + // Precondtion for `Enqueue` is that it is sent only once for a PVF; + // Thus this should always be `false`; + // qed. + tracing::warn!( + target: LOG_TARGET, + "duplicate `enqueue` command received for {:?}", + artifact_id, + ); + return Ok(()); + } + + let job = queue.jobs.insert(JobData { + priority, + pvf, + worker: None, + }); + queue.artifact_id_to_job.insert(artifact_id, job); + + if let Some(available) = find_idle_worker(queue) { + // This may seem not fair (w.r.t priority) on the first glance, but it should be. This is + // because as soon as a worker finishes with the job it's immediatelly given the next one. + assign(queue, available, job).await?; + } else { + spawn_extra_worker(queue, priority.is_critical()).await?; + queue.unscheduled.add(priority, job); + } + + Ok(()) +} + +fn find_idle_worker(queue: &mut Queue) -> Option { + queue + .workers + .iter() + .filter(|(_, data)| data.is_idle()) + .map(|(k, _)| k) + .next() +} + +async fn handle_amend( + queue: &mut Queue, + priority: Priority, + artifact_id: ArtifactId, +) -> Result<(), Fatal> { + if let Some(&job) = queue.artifact_id_to_job.get(&artifact_id) { + let mut job_data: &mut JobData = &mut queue.jobs[job]; + + if job_data.priority < priority { + // The new priority is higher. We should do two things: + // - if the worker was already spawned with the background prio and the new one is not + // (it's already the case, if we are in this branch but we still do the check for + // clarity), then we should tell the pool to bump the priority for the worker. + // + // - save the new priority in the job. + + if let Some(worker) = job_data.worker { + if job_data.priority.is_background() && !priority.is_background() { + send_pool(&mut queue.to_pool_tx, pool::ToPool::BumpPriority(worker)).await?; + } + } + + job_data.priority = priority; + } + } + + Ok(()) +} + +async fn handle_from_pool(queue: &mut Queue, from_pool: pool::FromPool) -> Result<(), Fatal> { + use pool::FromPool::*; + match from_pool { + Spawned(worker) => handle_worker_spawned(queue, worker).await?, + Concluded(worker, rip) => handle_worker_concluded(queue, worker, rip).await?, + Rip(worker) => handle_worker_rip(queue, worker).await?, + } + Ok(()) +} + +async fn handle_worker_spawned(queue: &mut Queue, worker: Worker) -> Result<(), Fatal> { + queue.workers.insert(worker, WorkerData::default()); + queue.spawn_inflight -= 1; + + if let Some(job) = queue.unscheduled.next() { + assign(queue, worker, job).await?; + } + + Ok(()) +} + +async fn handle_worker_concluded( + queue: &mut Queue, + worker: Worker, + rip: bool, +) -> Result<(), Fatal> { + macro_rules! never_none { + ($expr:expr) => { + match $expr { + Some(v) => v, + None => { + // Precondition of calling this is that the $expr is never none; + // Assume the conditions holds, then this never is not hit; + // qed. + never!("never_none, {}", stringify!($expr)); + return Ok(()); + } + } + }; + } + + // Find out on which artifact was the worker working. + + // workers are registered upon spawn and removed in one of the following cases: + // 1. received rip signal + // 2. received concluded signal with rip=true; + // concluded signal only comes from a spawned worker and only once; + // rip signal is not sent after conclusion with rip=true; + // the worker should be registered; + // this can't be None; + // qed. + let worker_data = never_none!(queue.workers.get_mut(worker)); + + // worker_data.job is set only by `assign` and removed only here for a worker; + // concluded signal only comes for a worker that was previously assigned and only once; + // the worker should have the job; + // this can't be None; + // qed. + let job = never_none!(worker_data.job.take()); + + // job_data is inserted upon enqueue and removed only here; + // as was established above, this worker was previously `assign`ed to the job; + // that implies that the job was enqueued; + // conclude signal only comes once; + // we are just to remove the job for the first and the only time; + // this can't be None; + // qed. + let job_data = never_none!(queue.jobs.remove(job)); + let artifact_id = job_data.pvf.as_artifact_id(); + + queue.artifact_id_to_job.remove(&artifact_id); + + reply(&mut queue.from_queue_tx, FromQueue::Prepared(artifact_id))?; + + // Figure out what to do with the worker. + if rip { + let worker_data = queue.workers.remove(worker); + // worker should exist, it's asserted above; + // qed. + always!(worker_data.is_some()); + + if !queue.unscheduled.is_empty() { + // That is unconditionally not critical just to not accidentally fill up + // the pool up to the hard cap. + spawn_extra_worker(queue, false).await?; + } + } else { + if queue + .limits + .should_cull(queue.workers.len() + queue.spawn_inflight) + { + // We no longer need services of this worker. Kill it. + queue.workers.remove(worker); + send_pool(&mut queue.to_pool_tx, pool::ToPool::Kill(worker)).await?; + } else { + // see if there are more work available and schedule it. + if let Some(job) = queue.unscheduled.next() { + assign(queue, worker, job).await?; + } + } + } + + Ok(()) +} + +async fn handle_worker_rip(queue: &mut Queue, worker: Worker) -> Result<(), Fatal> { + let worker_data = queue.workers.remove(worker); + + if let Some(WorkerData { job: Some(job), .. }) = worker_data { + // This is an edge case where the worker ripped after we sent assignment but before it + // was received by the pool. + let priority = queue + .jobs + .get(job) + .map(|data| data.priority) + .unwrap_or_else(|| { + // job is inserted upon enqueue and removed on concluded signal; + // this is enclosed in the if statement that narrows the situation to before + // conclusion; + // that means that the job still exists and is known; + // this path cannot be hit; + // qed. + never!("the job of the ripped worker must be known but it is not"); + Priority::Normal + }); + queue.unscheduled.readd(priority, job); + } + + // If there are still jobs left, spawn another worker to replace the ripped one (but only if it + // was indeed removed). That is unconditionally not critical just to not accidentally fill up + // the pool up to the hard cap. + if worker_data.is_some() && !queue.unscheduled.is_empty() { + spawn_extra_worker(queue, false).await?; + } + Ok(()) +} + +/// Spawns an extra worker if possible. +async fn spawn_extra_worker(queue: &mut Queue, critical: bool) -> Result<(), Fatal> { + if queue + .limits + .can_afford_one_more(queue.workers.len() + queue.spawn_inflight, critical) + { + queue.spawn_inflight += 1; + send_pool(&mut queue.to_pool_tx, pool::ToPool::Spawn).await?; + } + + Ok(()) +} + +/// Attaches the work to the given worker telling the poll about the job. +async fn assign(queue: &mut Queue, worker: Worker, job: Job) -> Result<(), Fatal> { + let job_data = &mut queue.jobs[job]; + + let artifact_id = job_data.pvf.as_artifact_id(); + let artifact_path = artifact_id.path(&queue.cache_path); + + job_data.worker = Some(worker); + + queue.workers[worker].job = Some(job); + + send_pool( + &mut queue.to_pool_tx, + pool::ToPool::StartWork { + worker, + code: job_data.pvf.code.clone(), + artifact_path, + background_priority: job_data.priority.is_background(), + }, + ) + .await?; + + Ok(()) +} + +fn reply(from_queue_tx: &mut mpsc::UnboundedSender, m: FromQueue) -> Result<(), Fatal> { + from_queue_tx.unbounded_send(m).map_err(|_| { + // The host has hung up and thus it's fatal and we should shutdown ourselves. + Fatal + }) +} + +async fn send_pool( + to_pool_tx: &mut mpsc::Sender, + m: pool::ToPool, +) -> Result<(), Fatal> { + to_pool_tx.send(m).await.map_err(|_| { + // The pool has hung up and thus we are no longer are able to fulfill our duties. Shutdown. + Fatal + }) +} + +/// Spins up the queue and returns the future that should be polled to make the queue functional. +pub fn start( + soft_capacity: usize, + hard_capacity: usize, + cache_path: PathBuf, + to_pool_tx: mpsc::Sender, + from_pool_rx: mpsc::UnboundedReceiver, +) -> ( + mpsc::Sender, + mpsc::UnboundedReceiver, + impl Future, +) { + let (to_queue_tx, to_queue_rx) = mpsc::channel(150); + let (from_queue_tx, from_queue_rx) = mpsc::unbounded(); + + let run = Queue::new( + soft_capacity, + hard_capacity, + cache_path, + to_queue_rx, + from_queue_tx, + to_pool_tx, + from_pool_rx, + ) + .run(); + + (to_queue_tx, from_queue_rx, run) +} + +#[cfg(test)] +mod tests { + use slotmap::SlotMap; + use assert_matches::assert_matches; + use futures::{FutureExt, future::BoxFuture}; + use std::task::Poll; + use super::*; + + /// Creates a new pvf which artifact id can be uniquely identified by the given number. + fn pvf(descriminator: u32) -> Pvf { + Pvf::from_discriminator(descriminator) + } + + async fn run_until( + task: &mut (impl Future + Unpin), + mut fut: (impl Future + Unpin), + ) -> R { + let start = std::time::Instant::now(); + let fut = &mut fut; + loop { + if start.elapsed() > std::time::Duration::from_secs(1) { + // We expect that this will take only a couple of iterations and thus to take way + // less than a second. + panic!("timeout"); + } + + if let Poll::Ready(r) = futures::poll!(&mut *fut) { + break r; + } + + if futures::poll!(&mut *task).is_ready() { + panic!() + } + } + } + + struct Test { + _tempdir: tempfile::TempDir, + run: BoxFuture<'static, ()>, + workers: SlotMap, + from_pool_tx: mpsc::UnboundedSender, + to_pool_rx: mpsc::Receiver, + to_queue_tx: mpsc::Sender, + from_queue_rx: mpsc::UnboundedReceiver, + } + + impl Test { + fn new(soft_capacity: usize, hard_capacity: usize) -> Self { + let tempdir = tempfile::tempdir().unwrap(); + + let (to_pool_tx, to_pool_rx) = mpsc::channel(10); + let (from_pool_tx, from_pool_rx) = mpsc::unbounded(); + + let workers: SlotMap = SlotMap::with_key(); + + let (to_queue_tx, from_queue_rx, run) = start( + soft_capacity, + hard_capacity, + tempdir.path().to_owned().into(), + to_pool_tx, + from_pool_rx, + ); + + Self { + _tempdir: tempdir, + run: run.boxed(), + workers, + from_pool_tx, + to_pool_rx, + to_queue_tx, + from_queue_rx, + } + } + + fn send_queue(&mut self, to_queue: ToQueue) { + self.to_queue_tx + .send(to_queue) + .now_or_never() + .unwrap() + .unwrap(); + } + + async fn poll_and_recv_from_queue(&mut self) -> FromQueue { + let from_queue_rx = &mut self.from_queue_rx; + run_until( + &mut self.run, + async { from_queue_rx.next().await.unwrap() }.boxed(), + ) + .await + } + + fn send_from_pool(&mut self, from_pool: pool::FromPool) { + self.from_pool_tx + .send(from_pool) + .now_or_never() + .unwrap() + .unwrap(); + } + + async fn poll_and_recv_to_pool(&mut self) -> pool::ToPool { + let to_pool_rx = &mut self.to_pool_rx; + run_until( + &mut self.run, + async { to_pool_rx.next().await.unwrap() }.boxed(), + ) + .await + } + + async fn poll_ensure_to_pool_is_empty(&mut self) { + use futures_timer::Delay; + use std::time::Duration; + + let to_pool_rx = &mut self.to_pool_rx; + run_until( + &mut self.run, + async { + futures::select! { + _ = Delay::new(Duration::from_millis(500)).fuse() => (), + _ = to_pool_rx.next().fuse() => { + panic!("to pool supposed to be empty") + } + } + } + .boxed(), + ) + .await + } + } + + #[async_std::test] + async fn properly_concludes() { + let mut test = Test::new(2, 2); + + test.send_queue(ToQueue::Enqueue { + priority: Priority::Background, + pvf: pvf(1), + }); + assert_eq!(test.poll_and_recv_to_pool().await, pool::ToPool::Spawn); + + let w = test.workers.insert(()); + test.send_from_pool(pool::FromPool::Spawned(w)); + test.send_from_pool(pool::FromPool::Concluded(w, false)); + + assert_eq!( + test.poll_and_recv_from_queue().await, + FromQueue::Prepared(pvf(1).as_artifact_id()) + ); + } + + #[async_std::test] + async fn dont_spawn_over_soft_limit_unless_critical() { + let mut test = Test::new(2, 3); + + test.send_queue(ToQueue::Enqueue { + priority: Priority::Normal, + pvf: pvf(1), + }); + test.send_queue(ToQueue::Enqueue { + priority: Priority::Normal, + pvf: pvf(2), + }); + test.send_queue(ToQueue::Enqueue { + priority: Priority::Normal, + pvf: pvf(3), + }); + + // Receive only two spawns. + assert_eq!(test.poll_and_recv_to_pool().await, pool::ToPool::Spawn); + assert_eq!(test.poll_and_recv_to_pool().await, pool::ToPool::Spawn); + + let w1 = test.workers.insert(()); + let w2 = test.workers.insert(()); + + test.send_from_pool(pool::FromPool::Spawned(w1)); + test.send_from_pool(pool::FromPool::Spawned(w2)); + + // Get two start works. + assert_matches!( + test.poll_and_recv_to_pool().await, + pool::ToPool::StartWork { .. } + ); + assert_matches!( + test.poll_and_recv_to_pool().await, + pool::ToPool::StartWork { .. } + ); + + test.send_from_pool(pool::FromPool::Concluded(w1, false)); + + assert_matches!( + test.poll_and_recv_to_pool().await, + pool::ToPool::StartWork { .. } + ); + + // Enqueue a critical job. + test.send_queue(ToQueue::Enqueue { + priority: Priority::Critical, + pvf: pvf(4), + }); + + // 2 out of 2 are working, but there is a critical job incoming. That means that spawning + // another worker is warranted. + assert_eq!(test.poll_and_recv_to_pool().await, pool::ToPool::Spawn); + } + + #[async_std::test] + async fn cull_unwanted() { + let mut test = Test::new(1, 2); + + test.send_queue(ToQueue::Enqueue { + priority: Priority::Normal, + pvf: pvf(1), + }); + assert_eq!(test.poll_and_recv_to_pool().await, pool::ToPool::Spawn); + let w1 = test.workers.insert(()); + test.send_from_pool(pool::FromPool::Spawned(w1)); + assert_matches!( + test.poll_and_recv_to_pool().await, + pool::ToPool::StartWork { .. } + ); + + // Enqueue a critical job, which warrants spawning over the soft limit. + test.send_queue(ToQueue::Enqueue { + priority: Priority::Critical, + pvf: pvf(2), + }); + assert_eq!(test.poll_and_recv_to_pool().await, pool::ToPool::Spawn); + + // However, before the new worker had a chance to spawn, the first worker finishes with its + // job. The old worker will be killed while the new worker will be let live, even though + // it's not instantiated. + // + // That's a bit silly in this context, but in production there will be an entire pool up + // to the `soft_capacity` of workers and it doesn't matter which one to cull. Either way, + // we just check that edge case of an edge case works. + test.send_from_pool(pool::FromPool::Concluded(w1, false)); + assert_eq!(test.poll_and_recv_to_pool().await, pool::ToPool::Kill(w1)); + } + + #[async_std::test] + async fn bump_prio_on_urgency_change() { + let mut test = Test::new(2, 2); + + test.send_queue(ToQueue::Enqueue { + priority: Priority::Background, + pvf: pvf(1), + }); + + assert_eq!(test.poll_and_recv_to_pool().await, pool::ToPool::Spawn); + + let w = test.workers.insert(()); + test.send_from_pool(pool::FromPool::Spawned(w)); + + assert_matches!( + test.poll_and_recv_to_pool().await, + pool::ToPool::StartWork { .. } + ); + test.send_queue(ToQueue::Amend { + priority: Priority::Normal, + artifact_id: pvf(1).as_artifact_id(), + }); + + assert_eq!( + test.poll_and_recv_to_pool().await, + pool::ToPool::BumpPriority(w) + ); + } + + #[async_std::test] + async fn worker_mass_die_out_doesnt_stall_queue() { + let mut test = Test::new(2, 2); + + test.send_queue(ToQueue::Enqueue { + priority: Priority::Normal, + pvf: pvf(1), + }); + test.send_queue(ToQueue::Enqueue { + priority: Priority::Normal, + pvf: pvf(2), + }); + test.send_queue(ToQueue::Enqueue { + priority: Priority::Normal, + pvf: pvf(3), + }); + + assert_eq!(test.poll_and_recv_to_pool().await, pool::ToPool::Spawn); + assert_eq!(test.poll_and_recv_to_pool().await, pool::ToPool::Spawn); + + let w1 = test.workers.insert(()); + let w2 = test.workers.insert(()); + + test.send_from_pool(pool::FromPool::Spawned(w1)); + test.send_from_pool(pool::FromPool::Spawned(w2)); + + assert_matches!( + test.poll_and_recv_to_pool().await, + pool::ToPool::StartWork { .. } + ); + assert_matches!( + test.poll_and_recv_to_pool().await, + pool::ToPool::StartWork { .. } + ); + + // Conclude worker 1 and rip it. + test.send_from_pool(pool::FromPool::Concluded(w1, true)); + + // Since there is still work, the queue requested one extra worker to spawn to handle the + // remaining enqueued work items. + assert_eq!(test.poll_and_recv_to_pool().await, pool::ToPool::Spawn); + assert_eq!( + test.poll_and_recv_from_queue().await, + FromQueue::Prepared(pvf(1).as_artifact_id()) + ); + } + + #[async_std::test] + async fn doesnt_resurrect_ripped_worker_if_no_work() { + let mut test = Test::new(2, 2); + + test.send_queue(ToQueue::Enqueue { + priority: Priority::Normal, + pvf: pvf(1), + }); + + assert_eq!(test.poll_and_recv_to_pool().await, pool::ToPool::Spawn); + + let w1 = test.workers.insert(()); + test.send_from_pool(pool::FromPool::Spawned(w1)); + + assert_matches!( + test.poll_and_recv_to_pool().await, + pool::ToPool::StartWork { .. } + ); + + test.send_from_pool(pool::FromPool::Concluded(w1, true)); + test.poll_ensure_to_pool_is_empty().await; + } + + #[async_std::test] + async fn rip_for_start_work() { + let mut test = Test::new(2, 2); + + test.send_queue(ToQueue::Enqueue { + priority: Priority::Normal, + pvf: pvf(1), + }); + + assert_eq!(test.poll_and_recv_to_pool().await, pool::ToPool::Spawn); + + let w1 = test.workers.insert(()); + test.send_from_pool(pool::FromPool::Spawned(w1)); + + // Now, to the interesting part. After the queue normally issues the start_work command to + // the pool, before receiving the command the queue may report that the worker ripped. + assert_matches!( + test.poll_and_recv_to_pool().await, + pool::ToPool::StartWork { .. } + ); + test.send_from_pool(pool::FromPool::Rip(w1)); + + // In this case, the pool should spawn a new worker and request it to work on the item. + assert_eq!(test.poll_and_recv_to_pool().await, pool::ToPool::Spawn); + + let w2 = test.workers.insert(()); + test.send_from_pool(pool::FromPool::Spawned(w2)); + assert_matches!( + test.poll_and_recv_to_pool().await, + pool::ToPool::StartWork { .. } + ); + } +} diff --git a/node/core/pvf/src/prepare/worker.rs b/node/core/pvf/src/prepare/worker.rs new file mode 100644 index 000000000000..9a6bdb1f8956 --- /dev/null +++ b/node/core/pvf/src/prepare/worker.rs @@ -0,0 +1,213 @@ +// Copyright 2021 Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +use crate::{ + LOG_TARGET, + artifacts::Artifact, + worker_common::{ + IdleWorker, SpawnErr, WorkerHandle, bytes_to_path, framed_recv, framed_send, path_to_bytes, + spawn_with_program_path, tmpfile, worker_event_loop, + }, +}; +use async_std::{ + io, + os::unix::net::UnixStream, + path::{PathBuf, Path}, +}; +use futures::FutureExt as _; +use futures_timer::Delay; +use std::{sync::Arc, time::Duration}; + +const NICENESS_BACKGROUND: i32 = 10; +const NICENESS_FOREGROUND: i32 = 0; + +const COMPILATION_TIMEOUT: Duration = Duration::from_secs(10); + +/// Spawns a new worker with the given program path that acts as the worker and the spawn timeout. +/// +/// The program should be able to handle ` prepare-worker ` invocation. +pub async fn spawn( + program_path: &Path, + spawn_timeout: Duration, +) -> Result<(IdleWorker, WorkerHandle), SpawnErr> { + spawn_with_program_path( + "prepare", + program_path, + &["prepare-worker"], + spawn_timeout, + ) + .await +} + +pub enum Outcome { + /// The worker has finished the work assigned to it. + Concluded(IdleWorker), + /// The execution was interrupted abruptly and the worker is not available anymore. For example, + /// this could've happen because the worker hadn't finished the work until the given deadline. + /// + /// Note that in this case the artifact file is written (unless there was an error writing the + /// the artifact). + /// + /// This doesn't return an idle worker instance, thus this worker is no longer usable. + DidntMakeIt, +} + +/// Given the idle token of a worker and parameters of work, communicates with the worker and +/// returns the outcome. +pub async fn start_work( + worker: IdleWorker, + code: Arc>, + artifact_path: PathBuf, + background_priority: bool, +) -> Outcome { + let IdleWorker { mut stream, pid } = worker; + + tracing::debug!( + target: LOG_TARGET, + worker_pid = %pid, + %background_priority, + "starting prepare for {}", + artifact_path.display(), + ); + + if background_priority { + renice(pid, NICENESS_BACKGROUND); + } + + if let Err(err) = send_request(&mut stream, code).await { + tracing::warn!("failed to send a prepare request to pid={}: {:?}", pid, err); + return Outcome::DidntMakeIt; + } + + // Wait for the result from the worker, keeping in mind that there may be a timeout, the + // worker may get killed, or something along these lines. + // + // In that case we should handle these gracefully by writing the artifact file by ourselves. + // We may potentially overwrite the artifact in rare cases where the worker didn't make + // it to report back the result. + + enum Selected { + Done, + IoErr, + Deadline, + } + + let selected = futures::select! { + artifact_path_bytes = framed_recv(&mut stream).fuse() => { + match artifact_path_bytes { + Ok(bytes) => { + if let Some(tmp_path) = bytes_to_path(&bytes) { + async_std::fs::rename(tmp_path, &artifact_path) + .await + .map(|_| Selected::Done) + .unwrap_or(Selected::IoErr) + } else { + Selected::IoErr + } + }, + Err(_) => Selected::IoErr, + } + }, + _ = Delay::new(COMPILATION_TIMEOUT).fuse() => Selected::Deadline, + }; + + match selected { + Selected::Done => { + renice(pid, NICENESS_FOREGROUND); + Outcome::Concluded(IdleWorker { stream, pid }) + } + Selected::IoErr | Selected::Deadline => { + let bytes = Artifact::DidntMakeIt.serialize(); + // best effort: there is nothing we can do here if the write fails. + let _ = async_std::fs::write(&artifact_path, &bytes).await; + Outcome::DidntMakeIt + } + } +} + +async fn send_request(stream: &mut UnixStream, code: Arc>) -> io::Result<()> { + framed_send(stream, &*code).await +} + +async fn recv_request(stream: &mut UnixStream) -> io::Result> { + framed_recv(stream).await +} + +pub fn bump_priority(handle: &WorkerHandle) { + let pid = handle.id(); + renice(pid, NICENESS_FOREGROUND); +} + +fn renice(pid: u32, niceness: i32) { + tracing::debug!( + target: LOG_TARGET, + worker_pid = %pid, + "changing niceness to {}", + niceness, + ); + + // Consider upstreaming this to the `nix` crate. + unsafe { + if -1 == libc::setpriority(libc::PRIO_PROCESS, pid, niceness) { + let err = std::io::Error::last_os_error(); + tracing::warn!(target: LOG_TARGET, "failed to set the priority: {:?}", err,); + } + } +} + +/// The entrypoint that the spawned prepare worker should start with. The socket_path specifies +/// the path to the socket used to communicate with the host. +pub fn worker_entrypoint(socket_path: &str) { + worker_event_loop("prepare", socket_path, |mut stream| async move { + loop { + let code = recv_request(&mut stream).await?; + + tracing::debug!( + target: LOG_TARGET, + worker_pid = %std::process::id(), + "worker: preparing artifact", + ); + let artifact_bytes = prepare_artifact(&code).serialize(); + + // Write the serialized artifact into into a temp file. + let dest = tmpfile("prepare-artifact-").await?; + tracing::debug!( + target: LOG_TARGET, + worker_pid = %std::process::id(), + "worker: writing artifact to {}", + dest.display(), + ); + async_std::fs::write(&dest, &artifact_bytes).await?; + + // Communicate the results back to the host. + framed_send(&mut stream, &path_to_bytes(&dest)).await?; + } + }); +} + +fn prepare_artifact(code: &[u8]) -> Artifact { + let blob = match crate::executor_intf::prevalidate(code) { + Err(err) => { + return Artifact::PrevalidationErr(format!("{:?}", err)); + } + Ok(b) => b, + }; + + match crate::executor_intf::prepare(blob) { + Ok(compiled_artifact) => Artifact::Compiled { compiled_artifact }, + Err(err) => Artifact::PreparationErr(format!("{:?}", err)), + } +} diff --git a/node/core/pvf/src/priority.rs b/node/core/pvf/src/priority.rs new file mode 100644 index 000000000000..8ba7b3907257 --- /dev/null +++ b/node/core/pvf/src/priority.rs @@ -0,0 +1,46 @@ +// Copyright 2021 Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +/// A priority assigned to execution of a PVF. +#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)] +pub enum Priority { + /// Jobs in this priority will be executed in the background, meaning that they will be only + /// given spare CPU time. + /// + /// This is mainly for cache warmings. + Background, + /// Normal priority for things that do not require immediate response, but still need to be + /// done pretty quick. + /// + /// Approvals and disputes fall into this category. + Normal, + /// This priority is used for requests that are required to be processed as soon as possible. + /// + /// For example, backing is on critical path and require execution as soon as possible. + Critical, +} + +impl Priority { + /// Returns `true` if `self` is `Crticial` + pub fn is_critical(self) -> bool { + self == Priority::Critical + } + + /// Returns `true` if `self` is `Background` + pub fn is_background(self) -> bool { + self == Priority::Background + } +} diff --git a/node/core/pvf/src/pvf.rs b/node/core/pvf/src/pvf.rs new file mode 100644 index 000000000000..7c0a70ac90bc --- /dev/null +++ b/node/core/pvf/src/pvf.rs @@ -0,0 +1,56 @@ +// Copyright 2021 Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +use crate::artifacts::ArtifactId; +use polkadot_core_primitives::Hash; +use sp_core::blake2_256; +use std::{fmt, sync::Arc}; + +/// A struct that carries code of a parachain validation function and it's hash. +/// +/// Should be cheap to clone. +#[derive(Clone)] +pub struct Pvf { + pub(crate) code: Arc>, + pub(crate) code_hash: Hash, +} + +impl fmt::Debug for Pvf { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "Pvf {{ code, code_hash: {:?} }}", self.code_hash) + } +} + +impl Pvf { + /// Returns an instance of the PVF out of the given PVF code. + pub fn from_code(code: Vec) -> Self { + let code = Arc::new(code); + let code_hash = blake2_256(&code).into(); + Self { code, code_hash } + } + + /// Creates a new pvf which artifact id can be uniquely identified by the given number. + #[cfg(test)] + pub(crate) fn from_discriminator(num: u32) -> Self { + let descriminator_buf = num.to_le_bytes().to_vec(); + Pvf::from_code(descriminator_buf) + } + + /// Returns the artifact ID that corresponds to this PVF. + pub(crate) fn as_artifact_id(&self) -> ArtifactId { + ArtifactId::new(self.code_hash) + } +} diff --git a/node/core/pvf/src/testing.rs b/node/core/pvf/src/testing.rs new file mode 100644 index 000000000000..7383b3136951 --- /dev/null +++ b/node/core/pvf/src/testing.rs @@ -0,0 +1,70 @@ +// Copyright 2021 Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +//! Various things for testing other crates. +//! +//! N.B. This is not guarded with some feature flag. Overexposing items here may affect the final +//! artifact even for production builds. + +pub mod worker_common { + pub use crate::worker_common::{spawn_with_program_path, SpawnErr}; +} + +/// A function that emulates the stitches together behaviors of the preparation and the execution +/// worker in a single synchronous function. +pub fn validate_candidate( + code: &[u8], + params: &[u8], +) -> Result, Box> { + use crate::executor_intf::{prevalidate, prepare, execute, TaskExecutor}; + + let blob = prevalidate(code)?; + let artifact = prepare(blob)?; + let executor = TaskExecutor::new()?; + let result = execute(&artifact, params, executor)?; + + Ok(result) +} + +/// Use this macro to declare a `fn main() {}` that will check the arguments and dispatch them to +/// the appropriate worker, making the executable that can be used for spawning workers. +#[macro_export] +macro_rules! decl_puppet_worker_main { + () => { + fn main() { + let args = std::env::args().collect::>(); + if args.len() < 2 { + panic!("wrong number of arguments"); + } + + let subcommand = &args[1]; + match subcommand.as_ref() { + "sleep" => { + std::thread::sleep(std::time::Duration::from_secs(5)); + } + "prepare-worker" => { + let socket_path = &args[2]; + $crate::prepare_worker_entrypoint(socket_path); + } + "execute-worker" => { + let socket_path = &args[2]; + $crate::execute_worker_entrypoint(socket_path); + } + other => panic!("unknown subcommand: {}", other), + } + } + }; +} diff --git a/node/core/pvf/src/worker_common.rs b/node/core/pvf/src/worker_common.rs new file mode 100644 index 000000000000..6818baa18f9a --- /dev/null +++ b/node/core/pvf/src/worker_common.rs @@ -0,0 +1,294 @@ +// Copyright 2021 Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +//! Common logic for implementation of worker processes. + +use crate::LOG_TARGET; +use async_std::{ + io, + os::unix::net::{UnixListener, UnixStream}, + path::{PathBuf, Path}, +}; +use futures::{ + AsyncRead, AsyncWrite, AsyncReadExt as _, AsyncWriteExt as _, FutureExt as _, never::Never, +}; +use futures_timer::Delay; +use rand::Rng; +use std::{ + fmt, mem, + pin::Pin, + task::{Context, Poll}, + time::Duration, +}; +use pin_project::pin_project; + +/// This is publicly exposed only for integration tests. +#[doc(hidden)] +pub async fn spawn_with_program_path( + debug_id: &'static str, + program_path: impl Into, + extra_args: &'static [&'static str], + spawn_timeout: Duration, +) -> Result<(IdleWorker, WorkerHandle), SpawnErr> { + let program_path = program_path.into(); + with_transient_socket_path(debug_id, |socket_path| { + let socket_path = socket_path.to_owned(); + async move { + let listener = UnixListener::bind(&socket_path) + .await + .map_err(|_| SpawnErr::Bind)?; + + let handle = WorkerHandle::spawn(program_path, extra_args, socket_path) + .map_err(|_| SpawnErr::ProcessSpawn)?; + + futures::select! { + accept_result = listener.accept().fuse() => { + let (stream, _) = accept_result.map_err(|_| SpawnErr::Accept)?; + Ok((IdleWorker { stream, pid: handle.id() }, handle)) + } + _ = Delay::new(spawn_timeout).fuse() => { + Err(SpawnErr::AcceptTimeout) + } + } + } + }) + .await +} + +async fn with_transient_socket_path(debug_id: &'static str, f: F) -> Result +where + F: FnOnce(&Path) -> Fut, + Fut: futures::Future> + 'static, +{ + let socket_path = tmpfile(&format!("pvf-host-{}", debug_id)) + .await + .map_err(|_| SpawnErr::TmpFile)?; + let result = f(&socket_path).await; + + // Best effort to remove the socket file. Under normal circumstances the socket will be removed + // by the worker. We make sure that it is removed here, just in case a failed rendezvous. + let _ = async_std::fs::remove_file(socket_path).await; + + result +} + +/// Returns a path under the location for temporary files. The file name will start with the given +/// prefix. +/// +/// There is only a certain number of retries. If exceeded this function will give up and return an +/// error. +pub async fn tmpfile(prefix: &str) -> io::Result { + fn tmppath(prefix: &str) -> PathBuf { + use rand::distributions::Alphanumeric; + + const DESCRIMINATOR_LEN: usize = 10; + + let mut buf = Vec::with_capacity(prefix.len() + DESCRIMINATOR_LEN); + buf.extend(prefix.as_bytes()); + buf.extend( + rand::thread_rng() + .sample_iter(&Alphanumeric) + .take(DESCRIMINATOR_LEN), + ); + + let s = std::str::from_utf8(&buf) + .expect("the string is collected from a valid utf-8 sequence; qed"); + + let mut temp_dir = PathBuf::from(std::env::temp_dir()); + temp_dir.push(s); + temp_dir + } + + const NUM_RETRIES: usize = 50; + + for _ in 0..NUM_RETRIES { + let candidate_path = tmppath(prefix); + if !candidate_path.exists().await { + return Ok(candidate_path) + } + } + + Err( + io::Error::new(io::ErrorKind::Other, "failed to create a temporary file") + ) +} + +pub fn worker_event_loop(debug_id: &'static str, socket_path: &str, mut event_loop: F) +where + F: FnMut(UnixStream) -> Fut, + Fut: futures::Future>, +{ + let err = async_std::task::block_on::<_, io::Result>(async move { + let stream = UnixStream::connect(socket_path).await?; + let _ = async_std::fs::remove_file(socket_path).await; + + event_loop(stream).await + }) + .unwrap_err(); // it's never `Ok` because it's `Ok(Never)` + + tracing::debug!( + target: LOG_TARGET, + worker_pid = %std::process::id(), + "pvf worker ({}): {:?}", + debug_id, + err, + ); +} + +/// A struct that represents an idle worker. +/// +/// This struct is supposed to be used as a token that is passed by move into a subroutine that +/// initiates a job. If the worker dies on the duty, then the token is not returned back. +#[derive(Debug)] +pub struct IdleWorker { + /// The stream to which the child process is connected. + pub stream: UnixStream, + + /// The identifier of this process. Used to reset the niceness. + pub pid: u32, +} + +/// An error happened during spawning a worker process. +#[derive(Clone, Debug)] +pub enum SpawnErr { + /// Cannot obtain a temporary file location. + TmpFile, + /// Cannot bind the socket to the given path. + Bind, + /// An error happened during accepting a connection to the socket. + Accept, + /// An error happened during spawning the process. + ProcessSpawn, + /// The deadline alloted for the worker spawning and connecting to the socket has elapsed. + AcceptTimeout, +} + +/// This is a representation of a potentially running worker. Drop it and the process will be killed. +/// +/// A worker's handle is also a future that resolves when it's detected that the worker's process +/// has been terminated. Since the worker is running in another process it is obviously not necessarily +/// to poll this future to make the worker run, it's only for termination detection. +/// +/// This future relies on the fact that a child process's stdout fd is closed upon it's termination. +#[pin_project] +pub struct WorkerHandle { + child: async_process::Child, + #[pin] + stdout: async_process::ChildStdout, + drop_box: Box<[u8]>, +} + +impl WorkerHandle { + fn spawn( + program: impl AsRef, + extra_args: &[&str], + socket_path: impl AsRef, + ) -> io::Result { + let mut child = async_process::Command::new(program.as_ref()) + .args(extra_args) + .arg(socket_path.as_ref().as_os_str()) + .stdout(async_process::Stdio::piped()) + .kill_on_drop(true) + .spawn()?; + + let stdout = child + .stdout + .take() + .expect("the process spawned with piped stdout should have the stdout handle"); + + Ok(WorkerHandle { + child, + stdout, + // We don't expect the bytes to be ever read. But in case we do, we should not use a buffer + // of a small size, because otherwise if the child process does return any data we will end up + // issuing a syscall for each byte. We also prefer not to do allocate that on the stack, since + // each poll the buffer will be allocated and initialized (and that's due poll_read takes &mut [u8] + // and there are no guarantees that a `poll_read` won't ever read from there even though that's + // unlikely). + // + // OTOH, we also don't want to be super smart here and we could just afford to allocate a buffer + // for that here. + drop_box: vec![0; 8192].into_boxed_slice(), + }) + } + + /// Returns the process id of this worker. + pub fn id(&self) -> u32 { + self.child.id() + } +} + +impl futures::Future for WorkerHandle { + type Output = (); + + fn poll(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll { + let me = self.project(); + match futures::ready!(AsyncRead::poll_read(me.stdout, cx, &mut *me.drop_box)) { + Ok(0) => { + // 0 means EOF means the child was terminated. Resolve. + Poll::Ready(()) + } + Ok(_bytes_read) => { + // weird, we've read something. Pretend that never happened and reschedule ourselves. + cx.waker().wake_by_ref(); + Poll::Pending + } + Err(_) => { + // The implementation is guaranteed to not to return WouldBlock and Interrupted. This + // leaves us with a legit errors which we suppose were due to termination. + Poll::Ready(()) + } + } + } +} + +impl fmt::Debug for WorkerHandle { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "WorkerHandle(pid={})", self.id()) + } +} + +/// Convert the given path into a byte buffer. +pub fn path_to_bytes(path: &Path) -> &[u8] { + // Ideally, we take the OsStr of the path, send that and reconstruct this on the other side. + // However, libstd doesn't provide us with such an option. There are crates out there that + // allow for extraction of a path, but TBH it doesn't seem to be a real issue. + // + // However, should be there reports we can incorporate such a crate here. + path.to_str().expect("non-UTF-8 path").as_bytes() +} + +/// Interprets the given bytes as a path. Returns `None` if the given bytes do not constitute a +/// a proper utf-8 string. +pub fn bytes_to_path(bytes: &[u8]) -> Option { + std::str::from_utf8(bytes).ok().map(PathBuf::from) +} + +pub async fn framed_send(w: &mut (impl AsyncWrite + Unpin), buf: &[u8]) -> io::Result<()> { + let len_buf = buf.len().to_le_bytes(); + w.write_all(&len_buf).await?; + w.write_all(buf).await?; + Ok(()) +} + +pub async fn framed_recv(r: &mut (impl AsyncRead + Unpin)) -> io::Result> { + let mut len_buf = [0u8; mem::size_of::()]; + r.read_exact(&mut len_buf).await?; + let len = usize::from_le_bytes(len_buf); + let mut buf = vec![0; len]; + r.read_exact(&mut buf).await?; + Ok(buf) +} diff --git a/node/core/pvf/tests/it/adder.rs b/node/core/pvf/tests/it/adder.rs new file mode 100644 index 000000000000..97af2ef4efca --- /dev/null +++ b/node/core/pvf/tests/it/adder.rs @@ -0,0 +1,131 @@ +// Copyright 2021 Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +use super::TestHost; +use polkadot_parachain::{ + primitives::{ + RelayChainBlockNumber, BlockData as GenericBlockData, HeadData as GenericHeadData, + ValidationParams, + }, +}; +use parity_scale_codec::{Decode, Encode}; +use adder::{HeadData, BlockData, hash_state}; + +#[async_std::test] +async fn execute_good_on_parent() { + let parent_head = HeadData { + number: 0, + parent_hash: [0; 32], + post_state: hash_state(0), + }; + + let block_data = BlockData { state: 0, add: 512 }; + + let host = TestHost::new(); + + let ret = host + .validate_candidate( + adder::wasm_binary_unwrap(), + ValidationParams { + parent_head: GenericHeadData(parent_head.encode()), + block_data: GenericBlockData(block_data.encode()), + relay_parent_number: 1, + relay_parent_storage_root: Default::default(), + }, + ) + .await + .unwrap(); + + let new_head = HeadData::decode(&mut &ret.head_data.0[..]).unwrap(); + + assert_eq!(new_head.number, 1); + assert_eq!(new_head.parent_hash, parent_head.hash()); + assert_eq!(new_head.post_state, hash_state(512)); +} + +#[async_std::test] +async fn execute_good_chain_on_parent() { + let mut number = 0; + let mut parent_hash = [0; 32]; + let mut last_state = 0; + + let host = TestHost::new(); + + for add in 0..10 { + let parent_head = HeadData { + number, + parent_hash, + post_state: hash_state(last_state), + }; + + let block_data = BlockData { + state: last_state, + add, + }; + + let ret = host + .validate_candidate( + adder::wasm_binary_unwrap(), + ValidationParams { + parent_head: GenericHeadData(parent_head.encode()), + block_data: GenericBlockData(block_data.encode()), + relay_parent_number: number as RelayChainBlockNumber + 1, + relay_parent_storage_root: Default::default(), + }, + ) + .await + .unwrap(); + + let new_head = HeadData::decode(&mut &ret.head_data.0[..]).unwrap(); + + assert_eq!(new_head.number, number + 1); + assert_eq!(new_head.parent_hash, parent_head.hash()); + assert_eq!(new_head.post_state, hash_state(last_state + add)); + + number += 1; + parent_hash = new_head.hash(); + last_state += add; + } +} + +#[async_std::test] +async fn execute_bad_on_parent() { + let parent_head = HeadData { + number: 0, + parent_hash: [0; 32], + post_state: hash_state(0), + }; + + let block_data = BlockData { + state: 256, // start state is wrong. + add: 256, + }; + + let host = TestHost::new(); + + let _ret = host + .validate_candidate( + adder::wasm_binary_unwrap(), + ValidationParams { + parent_head: GenericHeadData(parent_head.encode()), + block_data: GenericBlockData(block_data.encode()), + relay_parent_number: 1, + relay_parent_storage_root: Default::default(), + }, + ) + .await + .unwrap_err(); +} diff --git a/node/core/pvf/tests/it/main.rs b/node/core/pvf/tests/it/main.rs new file mode 100644 index 000000000000..6ea41b11d531 --- /dev/null +++ b/node/core/pvf/tests/it/main.rs @@ -0,0 +1,152 @@ +// Copyright 2021 Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +use polkadot_node_core_pvf::{Pvf, ValidationHost, start, Config, InvalidCandidate, ValidationError}; +use polkadot_parachain::{ + primitives::{BlockData, ValidationParams, ValidationResult}, +}; +use parity_scale_codec::Encode as _; +use async_std::sync::Mutex; + +mod adder; +mod worker_common; + +const PUPPET_EXE: &str = env!("CARGO_BIN_EXE_puppet_worker"); + +struct TestHost { + _cache_dir: tempfile::TempDir, + host: Mutex, +} + +impl TestHost { + fn new() -> Self { + Self::new_with_config(|_| ()) + } + + fn new_with_config(f: F) -> Self + where + F: FnOnce(&mut Config), + { + let cache_dir = tempfile::tempdir().unwrap(); + let program_path = std::path::PathBuf::from(PUPPET_EXE); + let mut config = Config::new(cache_dir.path().to_owned(), program_path); + f(&mut config); + let (host, task) = start(config); + let _ = async_std::task::spawn(task); + Self { + _cache_dir: cache_dir, + host: Mutex::new(host), + } + } + + async fn validate_candidate( + &self, + code: &[u8], + params: ValidationParams, + ) -> Result { + let (result_tx, result_rx) = futures::channel::oneshot::channel(); + self.host + .lock() + .await + .execute_pvf( + Pvf::from_code(code.to_vec()), + params.encode(), + polkadot_node_core_pvf::Priority::Normal, + result_tx, + ) + .await + .unwrap(); + result_rx.await.unwrap() + } +} + +#[async_std::test] +async fn terminates_on_timeout() { + let host = TestHost::new(); + + let result = host + .validate_candidate( + halt::wasm_binary_unwrap(), + ValidationParams { + block_data: BlockData(Vec::new()), + parent_head: Default::default(), + relay_parent_number: 1, + relay_parent_storage_root: Default::default(), + }, + ) + .await; + + match result { + Err(ValidationError::InvalidCandidate(InvalidCandidate::HardTimeout)) => {} + r => panic!("{:?}", r), + } +} + +#[async_std::test] +async fn parallel_execution() { + let host = TestHost::new(); + let execute_pvf_future_1 = host.validate_candidate( + halt::wasm_binary_unwrap(), + ValidationParams { + block_data: BlockData(Vec::new()), + parent_head: Default::default(), + relay_parent_number: 1, + relay_parent_storage_root: Default::default(), + }, + ); + let execute_pvf_future_2 = host.validate_candidate( + halt::wasm_binary_unwrap(), + ValidationParams { + block_data: BlockData(Vec::new()), + parent_head: Default::default(), + relay_parent_number: 1, + relay_parent_storage_root: Default::default(), + }, + ); + + let start = std::time::Instant::now(); + let (_, _) = futures::join!(execute_pvf_future_1, execute_pvf_future_2); + + // total time should be < 2 x EXECUTION_TIMEOUT_SEC + const EXECUTION_TIMEOUT_SEC: u64 = 3; + assert!( + std::time::Instant::now().duration_since(start) + < std::time::Duration::from_secs(EXECUTION_TIMEOUT_SEC * 2) + ); +} + +#[async_std::test] +async fn execute_queue_doesnt_stall_if_workers_died() { + let host = TestHost::new_with_config(|cfg| { + assert_eq!(cfg.execute_workers_max_num, 5); + }); + + // Here we spawn 8 validation jobs for the `halt` PVF and share those between 5 workers. The + // first five jobs should timeout and the workers killed. For the next 3 jobs a new batch of + // workers should be spun up. + futures::future::join_all((0u8..=8).map(|_| { + host.validate_candidate( + halt::wasm_binary_unwrap(), + ValidationParams { + block_data: BlockData(Vec::new()), + parent_head: Default::default(), + relay_parent_number: 1, + relay_parent_storage_root: Default::default(), + }, + ) + })) + .await; +} diff --git a/node/core/pvf/tests/it/worker_common.rs b/node/core/pvf/tests/it/worker_common.rs new file mode 100644 index 000000000000..ec77bd710441 --- /dev/null +++ b/node/core/pvf/tests/it/worker_common.rs @@ -0,0 +1,43 @@ +// Copyright 2021 Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +use crate::PUPPET_EXE; +use polkadot_node_core_pvf::testing::worker_common::{spawn_with_program_path, SpawnErr}; +use std::time::Duration; + +#[async_std::test] +async fn spawn_timeout() { + let result = spawn_with_program_path( + "integration-test", + PUPPET_EXE, + &["sleep"], + Duration::from_secs(2), + ) + .await; + assert!(matches!(result, Err(SpawnErr::AcceptTimeout))); +} + +#[async_std::test] +async fn should_connect() { + let _ = spawn_with_program_path( + "integration-test", + PUPPET_EXE, + &["prepare-worker"], + Duration::from_secs(2), + ) + .await + .unwrap(); +} diff --git a/node/service/src/lib.rs b/node/service/src/lib.rs index f12f9a77bf53..fb31b808bbdb 100644 --- a/node/service/src/lib.rs +++ b/node/service/src/lib.rs @@ -31,6 +31,7 @@ use { polkadot_node_core_av_store::Config as AvailabilityConfig, polkadot_node_core_av_store::Error as AvailabilityError, polkadot_node_core_approval_voting::Config as ApprovalVotingConfig, + polkadot_node_core_candidate_validation::Config as CandidateValidationConfig, polkadot_node_core_proposer::ProposerFactory, polkadot_overseer::{AllSubsystems, BlockInfo, Overseer, OverseerHandler}, polkadot_primitives::v1::ParachainHost, @@ -60,7 +61,6 @@ use telemetry::{Telemetry, TelemetryWorker, TelemetryWorkerHandle}; pub use self::client::{AbstractClient, Client, ClientHandle, ExecuteWithClient, RuntimeApiCollection}; pub use chain_spec::{PolkadotChainSpec, KusamaChainSpec, WestendChainSpec, RococoChainSpec}; pub use consensus_common::{Proposal, SelectChain, BlockImport, block_validation::Chain}; -pub use polkadot_parachain::wasm_executor::IsolationStrategy; pub use polkadot_primitives::v1::{Block, BlockId, CollatorPair, Hash, Id as ParaId}; pub use sc_client_api::{Backend, ExecutionStrategy, CallExecutor}; pub use sc_consensus::LongestChain; @@ -426,7 +426,7 @@ fn real_overseer( registry: Option<&Registry>, spawner: Spawner, is_collator: IsCollator, - isolation_strategy: IsolationStrategy, + candidate_validation_config: CandidateValidationConfig, ) -> Result<(Overseer>, OverseerHandler), Error> where RuntimeClient: 'static + ProvideRuntimeApi + HeaderBackend + AuxStore, @@ -484,10 +484,9 @@ where keystore.clone(), Metrics::register(registry)?, ), - candidate_validation: CandidateValidationSubsystem::new( - spawner.clone(), + candidate_validation: CandidateValidationSubsystem::with_config( + candidate_validation_config, Metrics::register(registry)?, - isolation_strategy, ), chain_api: ChainApiSubsystem::new( runtime_client.clone(), @@ -673,8 +672,8 @@ pub fn new_full( is_collator: IsCollator, grandpa_pause: Option<(u32, u32)>, jaeger_agent: Option, - isolation_strategy: IsolationStrategy, telemetry_worker_handle: Option, + program_path: Option, ) -> Result>>, Error> where RuntimeApi: ConstructRuntimeApi> + Send + Sync + 'static, @@ -785,6 +784,17 @@ pub fn new_full( slot_duration_millis: slot_duration.as_millis() as u64, }; + let candidate_validation_config = CandidateValidationConfig { + artifacts_cache_path: config.database + .path() + .ok_or(Error::DatabasePathRequired)? + .join("pvf-artifacts"), + program_path: match program_path { + None => std::env::current_exe()?, + Some(p) => p, + }, + }; + let chain_spec = config.chain_spec.cloned_box(); let rpc_handlers = service::spawn_tasks(service::SpawnTasksParams { config, @@ -863,7 +873,7 @@ pub fn new_full( prometheus_registry.as_ref(), spawner, is_collator, - isolation_strategy, + candidate_validation_config, )?; let overseer_handler_clone = overseer_handler.clone(); @@ -1216,27 +1226,14 @@ pub fn build_full( jaeger_agent: Option, telemetry_worker_handle: Option, ) -> Result, Error> { - let isolation_strategy = { - #[cfg(not(any(target_os = "android", target_os = "unknown")))] - { - let cache_base_path = config.database.path(); - IsolationStrategy::external_process_with_caching(cache_base_path) - } - - #[cfg(any(target_os = "android", target_os = "unknown"))] - { - IsolationStrategy::InProcess - } - }; - if config.chain_spec.is_rococo() { new_full::( config, is_collator, grandpa_pause, jaeger_agent, - isolation_strategy, telemetry_worker_handle, + None, ).map(|full| full.with_client(Client::Rococo)) } else if config.chain_spec.is_kusama() { new_full::( @@ -1244,8 +1241,8 @@ pub fn build_full( is_collator, grandpa_pause, jaeger_agent, - isolation_strategy, telemetry_worker_handle, + None, ).map(|full| full.with_client(Client::Kusama)) } else if config.chain_spec.is_westend() { new_full::( @@ -1253,8 +1250,8 @@ pub fn build_full( is_collator, grandpa_pause, jaeger_agent, - isolation_strategy, telemetry_worker_handle, + None, ).map(|full| full.with_client(Client::Westend)) } else { new_full::( @@ -1262,8 +1259,8 @@ pub fn build_full( is_collator, grandpa_pause, jaeger_agent, - isolation_strategy, telemetry_worker_handle, + None, ).map(|full| full.with_client(Client::Polkadot)) } } diff --git a/node/test/service/src/lib.rs b/node/test/service/src/lib.rs index 153c4590b20c..c467dc332322 100644 --- a/node/test/service/src/lib.rs +++ b/node/test/service/src/lib.rs @@ -53,7 +53,7 @@ use sp_blockchain::HeaderBackend; use sp_keyring::Sr25519Keyring; use sp_runtime::{codec::Encode, generic, traits::IdentifyAccount, MultiSigner}; use sp_state_machine::BasicExternalities; -use std::sync::Arc; +use std::{sync::Arc, path::PathBuf}; use substrate_test_client::{BlockchainEventsExt, RpcHandlersExt, RpcTransactionOutput, RpcTransactionError}; native_executor_instance!( @@ -73,6 +73,7 @@ pub use polkadot_service::FullBackend; pub fn new_full( config: Configuration, is_collator: IsCollator, + worker_program_path: Option, ) -> Result< NewFull>, Error, @@ -82,8 +83,8 @@ pub fn new_full( is_collator, None, None, - polkadot_parachain::wasm_executor::IsolationStrategy::InProcess, None, + worker_program_path, ) } @@ -214,11 +215,12 @@ pub fn run_validator_node( key: Sr25519Keyring, storage_update_func: impl Fn(), boot_nodes: Vec, + worker_program_path: Option, ) -> PolkadotTestNode { let config = node_config(storage_update_func, task_executor, key, boot_nodes, true); let multiaddr = config.network.listen_addresses[0].clone(); let NewFull { task_manager, client, network, rpc_handlers, overseer_handler, .. } = - new_full(config, IsCollator::No).expect("could not create Polkadot test service"); + new_full(config, IsCollator::No, worker_program_path).expect("could not create Polkadot test service"); let overseer_handler = overseer_handler.expect("test node must have an overseer handler"); let peer_id = network.local_peer_id().clone(); @@ -261,7 +263,7 @@ pub fn run_collator_node( rpc_handlers, overseer_handler, .. - } = new_full(config, IsCollator::Yes(collator_pair)) + } = new_full(config, IsCollator::Yes(collator_pair), None) .expect("could not create Polkadot test service"); let overseer_handler = overseer_handler.expect("test node must have an overseer handler"); diff --git a/node/test/service/tests/build-blocks.rs b/node/test/service/tests/build-blocks.rs index 9eb9f4ff328c..b563a6e46a7e 100644 --- a/node/test/service/tests/build-blocks.rs +++ b/node/test/service/tests/build-blocks.rs @@ -30,12 +30,14 @@ async fn ensure_test_service_build_blocks(task_executor: TaskExecutor) { Sr25519Keyring::Alice, || {}, Vec::new(), + None, ); let mut bob = run_validator_node( task_executor.clone(), Sr25519Keyring::Bob, || {}, vec![alice.addr.clone()], + None, ); { diff --git a/node/test/service/tests/call-function.rs b/node/test/service/tests/call-function.rs index c6802234c9c8..fa9a161923e6 100644 --- a/node/test/service/tests/call-function.rs +++ b/node/test/service/tests/call-function.rs @@ -20,7 +20,7 @@ use sp_keyring::Sr25519Keyring::{Alice, Bob}; #[substrate_test_utils::test] async fn call_function_actually_work(task_executor: TaskExecutor) { - let alice = run_validator_node(task_executor, Alice, || {}, Vec::new()); + let alice = run_validator_node(task_executor, Alice, || {}, Vec::new(), None); let function = polkadot_test_runtime::Call::Balances(pallet_balances::Call::transfer( Default::default(), diff --git a/parachain/Cargo.toml b/parachain/Cargo.toml index 04b64bca6c57..31fd2e46e062 100644 --- a/parachain/Cargo.toml +++ b/parachain/Cargo.toml @@ -14,47 +14,21 @@ parity-util-mem = { version = "0.9.0", optional = true } sp-std = { git = "https://github.com/paritytech/substrate", branch = "master", default-features = false } sp-runtime = { git = "https://github.com/paritytech/substrate", branch = "master", default-features = false } sp-core = { git = "https://github.com/paritytech/substrate", branch = "master", default-features = false } -sp-wasm-interface = { git = "https://github.com/paritytech/substrate", branch = "master", default-features = false } polkadot-core-primitives = { path = "../core-primitives", default-features = false } derive_more = "0.99.11" # all optional crates. -thiserror = { version = "1.0.22", optional = true } serde = { version = "1.0.117", default-features = false, features = [ "derive" ], optional = true } -sp-externalities = { git = "https://github.com/paritytech/substrate", branch = "master", optional = true } -sc-executor = { git = "https://github.com/paritytech/substrate", branch = "master", optional = true } -sp-io = { git = "https://github.com/paritytech/substrate", branch = "master", optional = true } -parking_lot = { version = "0.11.1", optional = true } -log = { version = "0.4.11", optional = true } -futures = { version = "0.3.8", optional = true } -static_assertions = { version = "1.1", optional = true } -libc = { version = "0.2.81", optional = true } - -[target.'cfg(not(any(target_os = "android", target_os = "unknown")))'.dependencies] -shared_memory = { version = "0.11.0", optional = true } -raw_sync = { version = "0.1", optional = true } [features] default = ["std"] -wasmtime = [ "sc-executor/wasmtime" ] wasm-api = [] std = [ "parity-scale-codec/std", - "thiserror", "serde/std", "sp-std/std", "sp-runtime/std", - "shared_memory", - "raw_sync", "sp-core/std", - "parking_lot", - "static_assertions", - "log", - "libc", "parity-util-mem", - "sp-externalities", - "sc-executor", - "sp-io", "polkadot-core-primitives/std", - "futures", ] diff --git a/parachain/src/lib.rs b/parachain/src/lib.rs index 67a5a6839245..2cf441e88c9f 100644 --- a/parachain/src/lib.rs +++ b/parachain/src/lib.rs @@ -14,6 +14,8 @@ // You should have received a copy of the GNU General Public License // along with Polkadot. If not, see . +#![warn(unused_crate_dependencies)] + //! Defines primitive types for creating or validating a parachain. //! //! When compiled with standard library support, this crate exports a `wasm` @@ -43,8 +45,6 @@ #![cfg_attr(not(feature = "std"), no_std)] -#[cfg(feature = "std")] -pub mod wasm_executor; pub mod primitives; mod wasm_api; diff --git a/parachain/src/wasm_executor/mod.rs b/parachain/src/wasm_executor/mod.rs deleted file mode 100644 index a9f5f9bae074..000000000000 --- a/parachain/src/wasm_executor/mod.rs +++ /dev/null @@ -1,401 +0,0 @@ -// Copyright 2017-2020 Parity Technologies (UK) Ltd. -// This file is part of Polkadot. - -// Polkadot is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. - -// Polkadot is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. - -// You should have received a copy of the GNU General Public License -// along with Polkadot. If not, see . - -//! WASM re-execution of a parachain candidate. -//! In the context of relay-chain candidate evaluation, there are some additional -//! steps to ensure that the provided input parameters are correct. -//! Assuming the parameters are correct, this module provides a wrapper around -//! a WASM VM for re-execution of a parachain candidate. - -use std::{any::{TypeId, Any}, path::{Path, PathBuf}}; -use crate::primitives::{ValidationParams, ValidationResult}; -use parity_scale_codec::{Decode, Encode}; -use sp_core::{storage::{ChildInfo, TrackedStorageKey}, traits::{CallInWasm, SpawnNamed}}; -use sp_externalities::Extensions; -use sp_wasm_interface::HostFunctions as _; - -#[cfg(not(any(target_os = "android", target_os = "unknown")))] -pub use validation_host::{run_worker, ValidationPool, EXECUTION_TIMEOUT_SEC, WORKER_ARGS}; - -mod validation_host; - -/// The strategy we employ for isolating execution of wasm parachain validation function (PVF). -/// -/// For a typical validator an external process is the default way to run PVF. The rationale is based -/// on the following observations: -/// -/// (a) PVF is completely under control of parachain developers who may or may not be malicious. -/// (b) Collators are in charge of providing PoV who also may or may not be malicious. -/// (c) PVF is executed by a wasm engine based on optimizing compiler which is a very complex piece -/// of machinery. -/// -/// (a) and (b) may lead to a situation where due to a combination of PVF and PoV the validation work -/// can stuck in an infinite loop, which can open up resource exhaustion or DoS attack vectors. -/// -/// While some execution engines provide functionality to interrupt execution of wasm module from -/// another thread, there are also some caveats to that: there is no clean way to interrupt execution -/// if the control flow is in the host side and at the moment we haven't rigoriously vetted that all -/// host functions terminate or, at least, return in a short amount of time. Additionally, we want -/// some freedom on choosing wasm execution environment. -/// -/// On top of that, execution in a separate process helps to minimize impact of (c) if exploited. -/// It's not only the risk of miscompilation, but it also includes risk of JIT-bombs, i.e. cases -/// of specially crafted code that take enourmous amounts of time and memory to compile. -/// -/// At the same time, since PVF validates self-contained candidates, validation workers don't require -/// extensive communication with polkadot host, therefore there should be no observable performance penalty -/// coming from inter process communication. -/// -/// All of the above should give a sense why isolation is crucial for a typical use-case. -/// -/// However, in some cases, e.g. when running PVF validation on android (for whatever reason), we -/// cannot afford the luxury of process isolation and thus there is an option to run validation in -/// process. Also, running in process is convenient for testing. -#[derive(Clone, Debug)] -pub enum IsolationStrategy { - /// The validation worker is ran in a thread inside the same process. - InProcess, - /// The validation worker is ran using the process' executable and the subcommand `validation-worker` is passed - /// following by the address of the shared memory. - #[cfg(not(any(target_os = "android", target_os = "unknown")))] - ExternalProcessSelfHost { - pool: ValidationPool, - cache_base_path: Option, - }, - /// The validation worker is ran using the command provided and the argument provided. The address of the shared - /// memory is added at the end of the arguments. - #[cfg(not(any(target_os = "android", target_os = "unknown")))] - ExternalProcessCustomHost { - /// Validation pool. - pool: ValidationPool, - /// Path to the validation worker. The file must exists and be executable. - binary: PathBuf, - /// List of arguments passed to the validation worker. The address of the shared memory will be automatically - /// added after the arguments. - args: Vec, - }, -} - -impl IsolationStrategy { - #[cfg(not(any(target_os = "android", target_os = "unknown")))] - pub fn external_process_with_caching(cache_base_path: Option<&Path>) -> Self { - // Convert cache path to string here so that we don't have to do that each time we launch - // validation worker. - let cache_base_path = cache_base_path.map(|path| path.display().to_string()); - - Self::ExternalProcessSelfHost { - pool: ValidationPool::new(), - cache_base_path, - } - } -} - -#[derive(Debug, thiserror::Error)] -/// Candidate validation error. -pub enum ValidationError { - /// Validation failed due to internal reasons. The candidate might still be valid. - #[error(transparent)] - Internal(#[from] InternalError), - /// Candidate is invalid. - #[error(transparent)] - InvalidCandidate(#[from] InvalidCandidate), -} - -/// Error type that indicates invalid candidate. -#[derive(Debug, thiserror::Error)] -pub enum InvalidCandidate { - /// Wasm executor error. - #[error("WASM executor error")] - WasmExecutor(#[from] sc_executor::error::Error), - /// Call data is too large. - #[error("Validation parameters are {0} bytes, max allowed is {1}")] - ParamsTooLarge(usize, usize), - /// Code size it too large. - #[error("WASM code is {0} bytes, max allowed is {1}")] - CodeTooLarge(usize, usize), - /// Error decoding returned data. - #[error("Validation function returned invalid data.")] - BadReturn, - #[error("Validation function timeout.")] - Timeout, - #[error("External WASM execution error: {0}")] - ExternalWasmExecutor(String), -} - -impl core::convert::From for InvalidCandidate { - fn from(s: String) -> Self { - Self::ExternalWasmExecutor(s) - } -} - -/// Host error during candidate validation. This does not indicate an invalid candidate. -#[derive(Debug, thiserror::Error)] -pub enum InternalError { - #[error("IO error: {0}")] - Io(#[from] std::io::Error), - - #[error("System error: {0}")] - System(#[from] Box), - - #[cfg(not(any(target_os = "android", target_os = "unknown")))] - #[error("Failed to create shared memory: {0}")] - WorkerStartTimeout(String), - - #[cfg(not(any(target_os = "android", target_os = "unknown")))] - #[error("Failed to create shared memory: {0}")] - FailedToCreateSharedMemory(String), - - #[cfg(not(any(target_os = "android", target_os = "unknown")))] - #[error("Failed to send a singal to worker: {0}")] - FailedToSignal(String), - - #[cfg(not(any(target_os = "android", target_os = "unknown")))] - #[error("Failed to send data to worker: {0}")] - FailedToWriteData(&'static str), - - #[error("WASM worker error: {0}")] - WasmWorker(String), -} - -/// A cache of executors for different parachain Wasm instances. -/// -/// This should be reused across candidate validation instances. -pub struct ExecutorCache(sc_executor::WasmExecutor); - -impl ExecutorCache { - /// Returns a new instance of an executor cache. - /// - /// `cache_base_path` allows to specify a directory where the executor is allowed to store files - /// for caching, e.g. compilation artifacts. - pub fn new(cache_base_path: Option) -> ExecutorCache { - ExecutorCache(sc_executor::WasmExecutor::new( - #[cfg(all(feature = "wasmtime", not(any(target_os = "android", target_os = "unknown"))))] - sc_executor::WasmExecutionMethod::Compiled, - #[cfg(any(not(feature = "wasmtime"), target_os = "android", target_os = "unknown"))] - sc_executor::WasmExecutionMethod::Interpreted, - // TODO: Make sure we don't use more than 1GB: https://github.com/paritytech/polkadot/issues/699 - Some(1024), - HostFunctions::host_functions(), - 8, - cache_base_path, - )) - } -} - -/// Validate a candidate under the given validation code. -/// -/// This will fail if the validation code is not a proper parachain validation module. -pub fn validate_candidate( - validation_code: &[u8], - params: ValidationParams, - isolation_strategy: &IsolationStrategy, - spawner: impl SpawnNamed + 'static, -) -> Result { - match isolation_strategy { - IsolationStrategy::InProcess => { - validate_candidate_internal( - &ExecutorCache::new(None), - validation_code, - ¶ms.encode(), - spawner, - ) - }, - #[cfg(not(any(target_os = "android", target_os = "unknown")))] - IsolationStrategy::ExternalProcessSelfHost { pool, cache_base_path } => { - pool.validate_candidate(validation_code, params, cache_base_path.as_deref()) - }, - #[cfg(not(any(target_os = "android", target_os = "unknown")))] - IsolationStrategy::ExternalProcessCustomHost { pool, binary, args } => { - let args: Vec<&str> = args.iter().map(|x| x.as_str()).collect(); - pool.validate_candidate_custom(validation_code, params, binary, &args) - }, - } -} - -/// The host functions provided by the wasm executor to the parachain wasm blob. -type HostFunctions = sp_io::SubstrateHostFunctions; - -/// Validate a candidate under the given validation code. -/// -/// This will fail if the validation code is not a proper parachain validation module. -pub fn validate_candidate_internal( - executor: &ExecutorCache, - validation_code: &[u8], - encoded_call_data: &[u8], - spawner: impl SpawnNamed + 'static, -) -> Result { - let executor = &executor.0; - - let mut extensions = Extensions::new(); - extensions.register(sp_core::traits::TaskExecutorExt::new(spawner)); - extensions.register(sp_core::traits::CallInWasmExt::new(executor.clone())); - - let mut ext = ValidationExternalities(extensions); - - // Expensive, but not more-so than recompiling the wasm module. - // And we need this hash to access the `sc_executor` cache. - let code_hash = { - use polkadot_core_primitives::{BlakeTwo256, HashT}; - BlakeTwo256::hash(validation_code) - }; - - let res = executor.call_in_wasm( - validation_code, - Some(code_hash.as_bytes().to_vec()), - "validate_block", - encoded_call_data, - &mut ext, - sp_core::traits::MissingHostFunctions::Allow, - ).map_err(|e| ValidationError::InvalidCandidate(e.into()))?; - - ValidationResult::decode(&mut &res[..]) - .map_err(|_| ValidationError::InvalidCandidate(InvalidCandidate::BadReturn).into()) -} - -/// The validation externalities that will panic on any storage related access. They just provide -/// access to the parachain extension. -struct ValidationExternalities(Extensions); - -impl sp_externalities::Externalities for ValidationExternalities { - fn storage(&self, _: &[u8]) -> Option> { - panic!("storage: unsupported feature for parachain validation") - } - - fn storage_hash(&self, _: &[u8]) -> Option> { - panic!("storage_hash: unsupported feature for parachain validation") - } - - fn child_storage_hash(&self, _: &ChildInfo, _: &[u8]) -> Option> { - panic!("child_storage_hash: unsupported feature for parachain validation") - } - - fn child_storage(&self, _: &ChildInfo, _: &[u8]) -> Option> { - panic!("child_storage: unsupported feature for parachain validation") - } - - fn kill_child_storage(&mut self, _: &ChildInfo, _: Option) -> (bool, u32) { - panic!("kill_child_storage: unsupported feature for parachain validation") - } - - fn clear_prefix(&mut self, _: &[u8]) { - panic!("clear_prefix: unsupported feature for parachain validation") - } - - fn clear_child_prefix(&mut self, _: &ChildInfo, _: &[u8]) { - panic!("clear_child_prefix: unsupported feature for parachain validation") - } - - fn place_storage(&mut self, _: Vec, _: Option>) { - panic!("place_storage: unsupported feature for parachain validation") - } - - fn place_child_storage(&mut self, _: &ChildInfo, _: Vec, _: Option>) { - panic!("place_child_storage: unsupported feature for parachain validation") - } - - fn storage_root(&mut self) -> Vec { - panic!("storage_root: unsupported feature for parachain validation") - } - - fn child_storage_root(&mut self, _: &ChildInfo) -> Vec { - panic!("child_storage_root: unsupported feature for parachain validation") - } - - fn storage_changes_root(&mut self, _: &[u8]) -> Result>, ()> { - panic!("storage_changes_root: unsupported feature for parachain validation") - } - - fn next_child_storage_key(&self, _: &ChildInfo, _: &[u8]) -> Option> { - panic!("next_child_storage_key: unsupported feature for parachain validation") - } - - fn next_storage_key(&self, _: &[u8]) -> Option> { - panic!("next_storage_key: unsupported feature for parachain validation") - } - - fn storage_append( - &mut self, - _key: Vec, - _value: Vec, - ) { - panic!("storage_append: unsupported feature for parachain validation") - } - - fn storage_start_transaction(&mut self) { - panic!("storage_start_transaction: unsupported feature for parachain validation") - } - - fn storage_rollback_transaction(&mut self) -> Result<(), ()> { - panic!("storage_rollback_transaction: unsupported feature for parachain validation") - } - - fn storage_commit_transaction(&mut self) -> Result<(), ()> { - panic!("storage_commit_transaction: unsupported feature for parachain validation") - } - - fn wipe(&mut self) { - panic!("wipe: unsupported feature for parachain validation") - } - - fn commit(&mut self) { - panic!("commit: unsupported feature for parachain validation") - } - - fn read_write_count(&self) -> (u32, u32, u32, u32) { - panic!("read_write_count: unsupported feature for parachain validation") - } - - fn reset_read_write_count(&mut self) { - panic!("reset_read_write_count: unsupported feature for parachain validation") - } - - fn get_whitelist(&self) -> Vec { - panic!("get_whitelist: unsupported feature for parachain validation") - } - - fn set_whitelist(&mut self, _: Vec) { - panic!("set_whitelist: unsupported feature for parachain validation") - } - - fn set_offchain_storage(&mut self, _: &[u8], _: std::option::Option<&[u8]>) { - panic!("set_offchain_storage: unsupported feature for parachain validation") - } -} - -impl sp_externalities::ExtensionStore for ValidationExternalities { - fn extension_by_type_id(&mut self, type_id: TypeId) -> Option<&mut dyn Any> { - self.0.get_mut(type_id) - } - - fn register_extension_with_type_id( - &mut self, - type_id: TypeId, - extension: Box, - ) -> Result<(), sp_externalities::Error> { - self.0.register_with_type_id(type_id, extension) - } - - fn deregister_extension_by_type_id( - &mut self, - type_id: TypeId, - ) -> Result<(), sp_externalities::Error> { - if self.0.deregister(type_id) { - Ok(()) - } else { - Err(sp_externalities::Error::ExtensionIsNotRegistered(type_id)) - } - } -} diff --git a/parachain/src/wasm_executor/validation_host.rs b/parachain/src/wasm_executor/validation_host.rs deleted file mode 100644 index ccfba3b585b8..000000000000 --- a/parachain/src/wasm_executor/validation_host.rs +++ /dev/null @@ -1,357 +0,0 @@ -// Copyright 2019-2020 Parity Technologies (UK) Ltd. -// This file is part of Polkadot. - -// Polkadot is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. - -// Polkadot is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. - -// You should have received a copy of the GNU General Public License -// along with Polkadot. If not, see . - -#![cfg(not(any(target_os = "android", target_os = "unknown")))] - -use std::{env, path::PathBuf, process, sync::Arc, sync::atomic}; -use crate::primitives::{ValidationParams, ValidationResult}; -use super::{validate_candidate_internal, ValidationError, InvalidCandidate, InternalError}; -use parking_lot::Mutex; -use log::{debug, trace}; -use futures::executor::ThreadPool; -use sp_core::traits::SpawnNamed; - -const WORKER_ARG: &'static str = "validation-worker"; -/// CLI Argument to start in validation worker mode. -pub const WORKER_ARGS: &[&'static str] = &[WORKER_ARG]; - -const LOG_TARGET: &'static str = "parachain::validation-worker"; - -mod workspace; - -/// Execution timeout in seconds; -#[cfg(debug_assertions)] -pub const EXECUTION_TIMEOUT_SEC: u64 = 30; - -#[cfg(not(debug_assertions))] -pub const EXECUTION_TIMEOUT_SEC: u64 = 5; - -#[derive(Clone)] -struct TaskExecutor(ThreadPool); - -impl TaskExecutor { - fn new() -> Result { - ThreadPool::new().map_err(|e| e.to_string()).map(Self) - } -} - -impl SpawnNamed for TaskExecutor { - fn spawn_blocking(&self, _: &'static str, future: futures::future::BoxFuture<'static, ()>) { - self.0.spawn_ok(future); - } - - fn spawn(&self, _: &'static str, future: futures::future::BoxFuture<'static, ()>) { - self.0.spawn_ok(future); - } -} - -/// A pool of hosts. -#[derive(Clone, Debug)] -pub struct ValidationPool { - hosts: Arc>>, -} - -const DEFAULT_NUM_HOSTS: usize = 8; - -impl ValidationPool { - /// Creates a validation pool with the default configuration. - pub fn new() -> ValidationPool { - ValidationPool { - hosts: Arc::new((0..DEFAULT_NUM_HOSTS).map(|_| Default::default()).collect()), - } - } - - /// Validate a candidate under the given validation code using the next free validation host. - /// - /// This will fail if the validation code is not a proper parachain validation module. - /// - /// This function will use `std::env::current_exe()` with the arguments that consist of [`WORKER_ARGS`] - /// with appended `cache_base_path` (if any). - pub fn validate_candidate( - &self, - validation_code: &[u8], - params: ValidationParams, - cache_base_path: Option<&str>, - ) -> Result { - use std::{iter, borrow::Cow}; - - let worker_cli_args = match cache_base_path { - Some(cache_base_path) => { - let worker_cli_args: Vec<&str> = WORKER_ARGS - .into_iter() - .cloned() - .chain(iter::once(cache_base_path)) - .collect(); - Cow::from(worker_cli_args) - } - None => Cow::from(WORKER_ARGS), - }; - - self.validate_candidate_custom( - validation_code, - params, - &env::current_exe().map_err(|err| ValidationError::Internal(err.into()))?, - &worker_cli_args, - ) - } - - /// Validate a candidate under the given validation code using the next free validation host. - /// - /// This will fail if the validation code is not a proper parachain validation module. - /// - /// This function will use the command and the arguments provided in the function's arguments to run the worker. - pub fn validate_candidate_custom( - &self, - validation_code: &[u8], - params: ValidationParams, - command: &PathBuf, - args: &[&str], - ) -> Result { - for host in self.hosts.iter() { - if let Some(mut host) = host.try_lock() { - return host.validate_candidate(validation_code, params, command, args); - } - } - - // all workers are busy, just wait for the first one - self.hosts[0] - .lock() - .validate_candidate(validation_code, params, command, args) - } -} - -/// Validation worker process entry point. Runs a loop waiting for candidates to validate -/// and sends back results via shared memory. -pub fn run_worker(mem_id: &str, cache_base_path: Option) -> Result<(), String> { - let mut worker_handle = match workspace::open(mem_id) { - Err(e) => { - debug!( - target: LOG_TARGET, - "{} Error opening shared memory: {:?}", - process::id(), - e - ); - return Err(e); - } - Ok(h) => h, - }; - - let exit = Arc::new(atomic::AtomicBool::new(false)); - let task_executor = TaskExecutor::new()?; - // spawn parent monitor thread - let watch_exit = exit.clone(); - std::thread::spawn(move || { - use std::io::Read; - let mut in_data = Vec::new(); - // pipe terminates when parent process exits - std::io::stdin().read_to_end(&mut in_data).ok(); - debug!( - target: LOG_TARGET, - "{} Parent process is dead. Exiting", - process::id() - ); - exit.store(true, atomic::Ordering::Relaxed); - }); - - worker_handle.signal_ready()?; - - let executor = super::ExecutorCache::new(cache_base_path); - - loop { - if watch_exit.load(atomic::Ordering::Relaxed) { - break; - } - - debug!( - target: LOG_TARGET, - "{} Waiting for candidate", - process::id() - ); - let work_item = match worker_handle.wait_for_work(3) { - Err(workspace::WaitForWorkErr::Wait(e)) => { - trace!( - target: LOG_TARGET, - "{} Timeout waiting for candidate: {:?}", - process::id(), - e - ); - continue; - } - Err(workspace::WaitForWorkErr::FailedToDecode(e)) => { - return Err(e); - } - Ok(work_item) => work_item, - }; - - debug!(target: LOG_TARGET, "{} Processing candidate", process::id()); - let result = validate_candidate_internal( - &executor, - work_item.code, - work_item.params, - task_executor.clone(), - ); - - debug!( - target: LOG_TARGET, - "{} Candidate validated: {:?}", - process::id(), - result - ); - - let result_header = match result { - Ok(r) => workspace::ValidationResultHeader::Ok(r), - Err(ValidationError::Internal(e)) => workspace::ValidationResultHeader::Error( - workspace::WorkerValidationError::InternalError(e.to_string()), - ), - Err(ValidationError::InvalidCandidate(e)) => workspace::ValidationResultHeader::Error( - workspace::WorkerValidationError::ValidationError(e.to_string()), - ), - }; - worker_handle - .report_result(result_header) - .map_err(|e| format!("error reporting result: {:?}", e))?; - } - Ok(()) -} - -unsafe impl Send for ValidationHost {} - -#[derive(Default, Debug)] -struct ValidationHost { - worker: Option, - host_handle: Option, - id: u32, -} - -impl Drop for ValidationHost { - fn drop(&mut self) { - if let Some(ref mut worker) = &mut self.worker { - worker.kill().ok(); - } - } -} - -impl ValidationHost { - fn start_worker(&mut self, cmd: &PathBuf, args: &[&str]) -> Result<(), InternalError> { - if let Some(ref mut worker) = self.worker { - // Check if still alive - if let Ok(None) = worker.try_wait() { - // Still running - return Ok(()); - } - } - - let host_handle = - workspace::create().map_err(|msg| InternalError::FailedToCreateSharedMemory(msg))?; - - debug!( - target: LOG_TARGET, - "Starting worker at {:?} with arguments: {:?} and {:?}", - cmd, - args, - host_handle.id(), - ); - let worker = process::Command::new(cmd) - .args(args) - .arg(host_handle.id()) - .stdin(process::Stdio::piped()) - .spawn()?; - self.id = worker.id(); - self.worker = Some(worker); - - host_handle - .wait_until_ready(EXECUTION_TIMEOUT_SEC) - .map_err(|e| InternalError::WorkerStartTimeout(format!("{:?}", e)))?; - self.host_handle = Some(host_handle); - Ok(()) - } - - /// Validate a candidate under the given validation code. - /// - /// This will fail if the validation code is not a proper parachain validation module. - pub fn validate_candidate( - &mut self, - validation_code: &[u8], - params: ValidationParams, - binary: &PathBuf, - args: &[&str], - ) -> Result { - // First, check if need to spawn the child process - self.start_worker(binary, args)?; - - let host_handle = self - .host_handle - .as_mut() - .expect("host_handle is always `Some` after `start_worker` completes successfully"); - - debug!(target: LOG_TARGET, "{} Signaling candidate", self.id); - match host_handle.request_validation(validation_code, params) { - Ok(()) => {} - Err(workspace::RequestValidationErr::CodeTooLarge { actual, max }) => { - return Err(ValidationError::InvalidCandidate( - InvalidCandidate::CodeTooLarge(actual, max), - )); - } - Err(workspace::RequestValidationErr::ParamsTooLarge { actual, max }) => { - return Err(ValidationError::InvalidCandidate( - InvalidCandidate::ParamsTooLarge(actual, max), - )); - } - Err(workspace::RequestValidationErr::Signal(msg)) => { - return Err(ValidationError::Internal(InternalError::FailedToSignal(msg))); - } - Err(workspace::RequestValidationErr::WriteData(msg)) => { - return Err(ValidationError::Internal(InternalError::FailedToWriteData(msg))); - } - } - - debug!(target: LOG_TARGET, "{} Waiting for results", self.id); - let result_header = match host_handle.wait_for_result(EXECUTION_TIMEOUT_SEC) { - Ok(inner_result) => inner_result, - Err(assumed_timeout) => { - debug!(target: LOG_TARGET, "Worker timeout: {:?}", assumed_timeout); - if let Some(mut worker) = self.worker.take() { - worker.kill().ok(); - } - return Err(ValidationError::InvalidCandidate(InvalidCandidate::Timeout)); - } - }; - - match result_header { - workspace::ValidationResultHeader::Ok(result) => Ok(result), - workspace::ValidationResultHeader::Error( - workspace::WorkerValidationError::InternalError(e), - ) => { - debug!( - target: LOG_TARGET, - "{} Internal validation error: {}", self.id, e - ); - Err(ValidationError::Internal(InternalError::WasmWorker(e))) - } - workspace::ValidationResultHeader::Error( - workspace::WorkerValidationError::ValidationError(e), - ) => { - debug!( - target: LOG_TARGET, - "{} External validation error: {}", self.id, e - ); - Err(ValidationError::InvalidCandidate( - InvalidCandidate::ExternalWasmExecutor(e), - )) - } - } - } -} diff --git a/parachain/src/wasm_executor/validation_host/workspace.rs b/parachain/src/wasm_executor/validation_host/workspace.rs deleted file mode 100644 index d8822f87a8e6..000000000000 --- a/parachain/src/wasm_executor/validation_host/workspace.rs +++ /dev/null @@ -1,614 +0,0 @@ -// Copyright 2021 Parity Technologies (UK) Ltd. -// This file is part of Polkadot. - -// Polkadot is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. - -// Polkadot is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. - -// You should have received a copy of the GNU General Public License -// along with Polkadot. If not, see . - -//! This module implements a "workspace" - basically a wrapper around a shared memory that -//! is used as an IPC channel for communication between the validation host and it's validation -//! worker. - -use crate::primitives::{ValidationParams, ValidationResult}; -use super::LOG_TARGET; -use parity_scale_codec::{Decode, Encode}; -use raw_sync::{ - events::{Event, EventImpl, EventInit, EventState}, - Timeout, -}; -use shared_memory::{Shmem, ShmemConf}; -use std::{ - error::Error, - fmt, - io::{Cursor, Write}, - slice, - sync::atomic::AtomicBool, - time::Duration, -}; - -// maximum memory in bytes -const MAX_PARAMS_MEM: usize = 16 * 1024 * 1024; // 16 MiB -const MAX_CODE_MEM: usize = 16 * 1024 * 1024; // 16 MiB - -/// The size of the shared workspace region. The maximum amount -const SHARED_WORKSPACE_SIZE: usize = MAX_PARAMS_MEM + MAX_CODE_MEM + (1024 * 1024); - -/// Params header in shared memory. All offsets should be aligned to WASM page size. -#[derive(Encode, Decode, Debug)] -struct ValidationHeader { - code_size: u64, - params_size: u64, -} - -/// An error that could happen during validation of a candidate. -#[derive(Encode, Decode, Debug, PartialEq, Eq, Clone)] -pub enum WorkerValidationError { - InternalError(String), - ValidationError(String), -} - -/// An enum that is used to marshal a validation result in order to pass it through the shared memory. -#[derive(Encode, Decode, Debug, PartialEq, Eq, Clone)] -pub enum ValidationResultHeader { - Ok(ValidationResult), - Error(WorkerValidationError), -} - -#[derive(Debug, PartialEq, Eq, Copy, Clone)] -enum Mode { - Initialize, - Attach, -} - -fn stringify_err(err: Box) -> String { - format!("{:?}", err) -} - -struct Inner { - shmem: Shmem, - candidate_ready_ev: Box, - result_ready_ev: Box, - worker_ready_ev: Box, - - /// Flag that indicates that the worker side is attached to this workspace. - /// - /// While there are apparent problems attaching multiple workers to the same workspace, we don't - /// need that anyway. So to make our reasoning a little bit simpler just add a flag and check - /// it before attaching. - attached: *mut AtomicBool, - - /// The number of bytes reserved by the auxilary stuff like events from the beginning of the - /// shared memory area. - /// - /// We expect this to be way smaller than the whole shmem size. - consumed: usize, -} - -impl Inner { - fn layout(shmem: Shmem, mode: Mode) -> Self { - unsafe { - let base_ptr = shmem.as_ptr(); - let mut consumed = 0; - - let candidate_ready_ev = add_event(base_ptr, &mut consumed, mode); - let result_ready_ev = add_event(base_ptr, &mut consumed, mode); - let worker_ready_ev = add_event(base_ptr, &mut consumed, mode); - - // The size of AtomicBool is guaranteed to be the same as the bool, however, docs - // on the bool primitve doesn't actually state that the in-memory size is equal to 1 byte. - // - // AtomicBool requires hardware support of 1 byte width of atomic operations though, so - // that should be fine. - // - // We still assert here to be safe than sorry. - static_assertions::assert_eq_size!(AtomicBool, u8); - // SAFETY: `AtomicBool` is represented by an u8 thus will be happy to take any alignment. - let attached = base_ptr.add(consumed) as *mut AtomicBool; - consumed += 1; - - let consumed = align_up_to(consumed, 64); - - Self { - shmem, - attached, - consumed, - candidate_ready_ev, - result_ready_ev, - worker_ready_ev, - } - } - } - - fn as_slice(&self) -> &[u8] { - unsafe { - let base_ptr = self.shmem.as_ptr().add(self.consumed); - let remaining = self.shmem.len() - self.consumed; - slice::from_raw_parts(base_ptr, remaining) - } - } - - fn as_slice_mut(&mut self) -> &mut [u8] { - unsafe { - let base_ptr = self.shmem.as_ptr().add(self.consumed); - let remaining = self.shmem.len() - self.consumed; - slice::from_raw_parts_mut(base_ptr, remaining) - } - } - - /// Mark that this workspace has an attached worker already. Returning `true` means that this - /// was the first worker attached. - fn declare_exclusive_attached(&self) -> bool { - unsafe { - // If this succeeded then the value was `false`, thus, we managed to attach exclusively. - (&*self.attached) - .compare_exchange_weak( - false, - true, - std::sync::atomic::Ordering::SeqCst, - std::sync::atomic::Ordering::SeqCst, - ) - .is_ok() - } - } -} - -fn align_up_to(v: usize, alignment: usize) -> usize { - ((v + alignment - 1) / alignment) * alignment -} - -/// Initializes a new or attaches to an exising event. -/// -/// # Safety -/// -/// This function should be called with the combination of `base_ptr` and `consumed` so that `base_ptr + consumed` -/// points on the memory area that is allocated and accessible. -/// -/// This function should be called only once for the same combination of the `base_ptr + consumed` and the mode. -/// Furthermore, this function should be called once for initialization. -/// -/// Specifically, `consumed` should not be modified by the caller, it should be passed as is to this function. -unsafe fn add_event(base_ptr: *mut u8, consumed: &mut usize, mode: Mode) -> Box { - // SAFETY: there is no safety proof since the documentation doesn't specify the particular constraints - // besides requiring the pointer to be valid. AFAICT, the pointer is valid. - let ptr = base_ptr.add(*consumed); - - const EXPECTATION: &str = - "given that the preconditions were fulfilled, the creation of the event should succeed"; - let (ev, used_bytes) = match mode { - Mode::Initialize => Event::new(ptr, true).expect(EXPECTATION), - Mode::Attach => Event::from_existing(ptr).expect(EXPECTATION), - }; - *consumed += used_bytes; - ev -} - -/// A message received by the worker that specifies a candidate validation work. -pub struct WorkItem<'handle> { - pub params: &'handle [u8], - pub code: &'handle [u8], -} - -/// An error that could be returned from [`WorkerHandle::wait_for_work`]. -#[derive(Debug)] -pub enum WaitForWorkErr { - /// An error occured during waiting for work. Typically a timeout. - Wait(String), - /// An error ocurred when trying to decode the validation request from the host. - FailedToDecode(String), -} - -/// An error that could be returned from [`WorkerHandle::report_result`]. -#[derive(Debug)] -pub enum ReportResultErr { - /// An error occured during signalling to the host that the result is ready. - Signal(String), -} - -/// A worker side handle to a workspace. -pub struct WorkerHandle { - inner: Inner, -} - -impl WorkerHandle { - /// Signals to the validation host that this worker is ready to accept new work requests. - pub fn signal_ready(&self) -> Result<(), String> { - self.inner - .worker_ready_ev - .set(EventState::Signaled) - .map_err(stringify_err)?; - Ok(()) - } - - /// Waits until a new piece of work. Returns `Err` if the work doesn't come within the given - /// timeout. - pub fn wait_for_work(&mut self, timeout_secs: u64) -> Result { - self.inner - .candidate_ready_ev - .wait(Timeout::Val(Duration::from_secs(timeout_secs))) - .map_err(stringify_err) - .map_err(WaitForWorkErr::Wait)?; - - let mut cur = self.inner.as_slice(); - let header = ValidationHeader::decode(&mut cur) - .map_err(|e| format!("{:?}", e)) - .map_err(WaitForWorkErr::FailedToDecode)?; - - let (params, cur) = cur.split_at(header.params_size as usize); - let (code, _) = cur.split_at(header.code_size as usize); - - Ok(WorkItem { params, code }) - } - - /// Report back the result of validation. - pub fn report_result(&mut self, result: ValidationResultHeader) -> Result<(), ReportResultErr> { - let mut cur = self.inner.as_slice_mut(); - result.encode_to(&mut cur); - self.inner - .result_ready_ev - .set(EventState::Signaled) - .map_err(stringify_err) - .map_err(ReportResultErr::Signal)?; - - Ok(()) - } -} - -/// An error that could be returned from [`HostHandle::wait_until_ready`]. -#[derive(Debug)] -pub enum WaitUntilReadyErr { - /// An error occured during waiting for the signal from the worker. - Wait(String), -} - -/// An error that could be returned from [`HostHandle::request_validation`]. -#[derive(Debug)] -pub enum RequestValidationErr { - /// The code passed exceeds the maximum allowed limit. - CodeTooLarge { actual: usize, max: usize }, - /// The call parameters exceed the maximum allowed limit. - ParamsTooLarge { actual: usize, max: usize }, - /// An error occured during writing either the code or the call params (the inner string specifies which) - WriteData(&'static str), - /// An error occured during signalling that the request is ready. - Signal(String), -} - -/// An error that could be returned from [`HostHandle::wait_for_result`] -#[derive(Debug)] -pub enum WaitForResultErr { - /// A error happened during waiting for the signal. Typically a timeout. - Wait(String), - /// Failed to decode the result header sent by the worker. - HeaderDecodeErr(String), -} - -/// A worker side handle to a workspace. -pub struct HostHandle { - inner: Inner, -} - -impl fmt::Debug for HostHandle { - fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - write!(f, "HostHandle") - } -} - -impl HostHandle { - /// Returns the OS specific ID for this workspace. - pub fn id(&self) -> &str { - self.inner.shmem.get_os_id() - } - - /// Wait until the worker is online and ready for accepting validation requests. - pub fn wait_until_ready(&self, timeout_secs: u64) -> Result<(), WaitUntilReadyErr> { - self.inner - .worker_ready_ev - .wait(Timeout::Val(Duration::from_secs(timeout_secs))) - .map_err(stringify_err) - .map_err(WaitUntilReadyErr::Wait)?; - Ok(()) - } - - /// Request validation with the given code and parameters. - pub fn request_validation( - &mut self, - code: &[u8], - params: ValidationParams, - ) -> Result<(), RequestValidationErr> { - if code.len() > MAX_CODE_MEM { - return Err(RequestValidationErr::CodeTooLarge { - actual: code.len(), - max: MAX_CODE_MEM, - }); - } - - let params = params.encode(); - if params.len() > MAX_PARAMS_MEM { - return Err(RequestValidationErr::ParamsTooLarge { - actual: params.len(), - max: MAX_PARAMS_MEM, - }); - } - - let mut cur = Cursor::new(self.inner.as_slice_mut()); - ValidationHeader { - code_size: code.len() as u64, - params_size: params.len() as u64, - } - .encode_to(&mut cur); - cur.write_all(¶ms) - .map_err(|_| RequestValidationErr::WriteData("params"))?; - cur.write_all(code) - .map_err(|_| RequestValidationErr::WriteData("code"))?; - - self.inner - .candidate_ready_ev - .set(EventState::Signaled) - .map_err(stringify_err) - .map_err(RequestValidationErr::Signal)?; - - Ok(()) - } - - /// Wait for the validation result from the worker with the given timeout. - /// - /// Returns `Ok` if the response was received within the deadline or error otherwise. An error - /// could also occur because of failing decoding the result from the worker. Returning - /// `Ok` doesn't mean that the candidate was successfully validated though, for that the client - /// needs to inspect the returned validation result header. - pub fn wait_for_result( - &self, - execution_timeout: u64, - ) -> Result { - self.inner - .result_ready_ev - .wait(Timeout::Val(Duration::from_secs(execution_timeout))) - .map_err(|e| WaitForResultErr::Wait(format!("{:?}", e)))?; - - let mut cur = self.inner.as_slice(); - let header = ValidationResultHeader::decode(&mut cur) - .map_err(|e| WaitForResultErr::HeaderDecodeErr(format!("{:?}", e)))?; - Ok(header) - } -} - -/// Create a new workspace and return a handle to it. -pub fn create() -> Result { - let shmem = ShmemConf::new() - .size(SHARED_WORKSPACE_SIZE) - .create() - .map_err(|e| format!("Error creating shared memory: {:?}", e))?; - - Ok(HostHandle { - inner: Inner::layout(shmem, Mode::Initialize), - }) -} - -/// Open a workspace with the given `id`. -/// -/// You can attach only once to a single workspace. -pub fn open(id: &str) -> Result { - let shmem = ShmemConf::new() - .os_id(id) - .open() - .map_err(|e| format!("Error opening shared memory: {:?}", e))?; - - #[cfg(unix)] - unlink_shmem(&id); - - let inner = Inner::layout(shmem, Mode::Attach); - if !inner.declare_exclusive_attached() { - return Err(format!("The workspace has been already attached to")); - } - - return Ok(WorkerHandle { inner }); - - #[cfg(unix)] - fn unlink_shmem(shmem_id: &str) { - // Unlink the shmem. Unlinking it from the filesystem will make it unaccessible for further - // opening, however, the kernel will still let the object live until the last reference dies - // out. - // - // There is still a chance that the shm stays on the fs, but that's a highly unlikely case - // that we don't address at this time. - - // shared-memory doesn't return file path to the shmem if get_flink_path is called, so we - // resort to `shm_unlink`. - // - // Additionally, even thouygh `fs::remove_file` is said to use `unlink` we still avoid relying on it, - // because the stdlib doesn't actually provide any gurantees on what syscalls will be called. - // (Not sure, what alternative it has though). - unsafe { - // must be in a local var in order to be not deallocated. - let shmem_id_cstr = - std::ffi::CString::new(shmem_id).expect("the shmmem id cannot have NUL in it; qed"); - - if libc::shm_unlink(shmem_id_cstr.as_ptr()) == -1 { - // failed to remove the shmem file nothing we can do ¯\_(ツ)_/¯ - log::warn!( - target: LOG_TARGET, - "failed to remove the shmem with id {}", - shmem_id, - ); - } - } - } -} - -#[cfg(test)] -mod tests { - use polkadot_core_primitives::OutboundHrmpMessage; - - use crate::primitives::BlockData; - - use super::*; - use std::thread; - - #[test] - fn wait_until_ready() { - let host = create().unwrap(); - - let worker_handle = thread::spawn({ - let id = host.id().to_string(); - move || { - let worker = open(&id).unwrap(); - worker.signal_ready().unwrap(); - } - }); - - host.wait_until_ready(1).unwrap(); - - worker_handle.join().unwrap(); - } - - #[test] - fn wait_until_ready_timeout() { - let host = create().unwrap(); - - let _worker_handle = thread::spawn({ - let id = host.id().to_string(); - move || { - let _worker = open(&id).unwrap(); - } - }); - - assert!(matches!( - host.wait_until_ready(1), - Err(WaitUntilReadyErr::Wait(_)) - )); - } - - #[test] - fn open_junk_id() { - assert!(open("").is_err()); - assert!(open("non_existent").is_err()); - assert!(open("☭").is_err()); - } - - #[test] - fn attach_twice() { - let host = create().unwrap(); - - thread::spawn({ - let id = host.id().to_string(); - move || { - let _worker1 = open(&id).unwrap(); - assert!(open(&id).is_err()); - } - }); - } - - #[test] - fn validation_works() { - let mut host = create().unwrap(); - - let worker_handle = thread::spawn({ - let id = host.id().to_string(); - move || { - let mut worker = open(&id).unwrap(); - worker.signal_ready().unwrap(); - - let work = worker.wait_for_work(3).unwrap(); - assert_eq!(work.code, b"\0asm\01\00\00\00"); - - worker - .report_result(ValidationResultHeader::Ok(ValidationResult { - head_data: Default::default(), - new_validation_code: None, - upward_messages: vec![], - horizontal_messages: vec![], - processed_downward_messages: 322, - hrmp_watermark: 0, - })) - .unwrap(); - } - }); - - host.wait_until_ready(1).unwrap(); - host.request_validation( - b"\0asm\01\00\00\00", - ValidationParams { - parent_head: Default::default(), - block_data: BlockData(b"hello world".to_vec()), - relay_parent_number: 228, - relay_parent_storage_root: Default::default(), - }, - ) - .unwrap(); - - match host.wait_for_result(3).unwrap() { - ValidationResultHeader::Ok(r) => { - assert_eq!(r.processed_downward_messages, 322); - } - _ => panic!(), - } - - worker_handle.join().unwrap(); - } - - #[test] - fn works_with_jumbo_sized_params() { - let mut host = create().unwrap(); - - let jumbo_code = vec![0x42; 16 * 1024 * 104]; - let fat_pov = vec![0x33; 16 * 1024 * 104]; - let big_params = ValidationParams { - parent_head: Default::default(), - block_data: BlockData(fat_pov), - relay_parent_number: 228, - relay_parent_storage_root: Default::default(), - // If modifying please make sure that this has a big size. - }; - let plump_result = ValidationResultHeader::Ok(ValidationResult { - head_data: Default::default(), - new_validation_code: Some(jumbo_code.clone().into()), - processed_downward_messages: 322, - hrmp_watermark: 0, - // We don't know about the limits here. Just make sure that those are reasonably big. - upward_messages: fill(|| vec![0x99; 8 * 1024], 64), - horizontal_messages: fill( - || OutboundHrmpMessage { - recipient: 1.into(), - data: vec![0x11; 8 * 1024], - }, - 64, - ), - // If modifying please make sure that this has a big size. - }); - - let _worker_handle = thread::spawn({ - let id = host.id().to_string(); - let jumbo_code = jumbo_code.clone(); - let big_params = big_params.clone(); - let plump_result = plump_result.clone(); - move || { - let mut worker = open(&id).unwrap(); - worker.signal_ready().unwrap(); - - let work = worker.wait_for_work(3).unwrap(); - assert_eq!(work.code, &jumbo_code); - assert_eq!(work.params, &big_params.encode()); - - worker.report_result(plump_result).unwrap(); - } - }); - - host.wait_until_ready(1).unwrap(); - host.request_validation(&jumbo_code, big_params).unwrap(); - - assert_eq!(host.wait_for_result(3).unwrap(), plump_result); - - fn fill T>(f: F, times: usize) -> Vec { - std::iter::repeat_with(f).take(times).collect() - } - } -} diff --git a/parachain/test-parachains/adder/collator/Cargo.toml b/parachain/test-parachains/adder/collator/Cargo.toml index dfe303f2dcb4..7b39707934fa 100644 --- a/parachain/test-parachains/adder/collator/Cargo.toml +++ b/parachain/test-parachains/adder/collator/Cargo.toml @@ -9,6 +9,10 @@ edition = "2018" name = "adder-collator" path = "src/main.rs" +[[bin]] +name = "adder_collator_puppet_worker" +path = "bin/puppet_worker.rs" + [dependencies] parity-scale-codec = { version = "2.0.0", default-features = false, features = ["derive"] } futures = "0.3.12" @@ -28,6 +32,11 @@ sp-core = { git = "https://github.com/paritytech/substrate", branch = "master" } sc-authority-discovery = { git = "https://github.com/paritytech/substrate", branch = "master" } sc-service = { git = "https://github.com/paritytech/substrate", branch = "master" } +# This one is tricky. Even though it is not used directly by the collator, we still need it for the +# `puppet_worker` binary, which is required for the integration test. However, this shouldn't be +# a big problem since it is used transitively anyway. +polkadot-node-core-pvf = { path = "../../../../node/core/pvf" } + [dev-dependencies] polkadot-parachain = { path = "../../.." } polkadot-test-service = { path = "../../../../node/test/service" } diff --git a/parachain/test-parachains/adder/collator/bin/puppet_worker.rs b/parachain/test-parachains/adder/collator/bin/puppet_worker.rs new file mode 100644 index 000000000000..4b026e96a809 --- /dev/null +++ b/parachain/test-parachains/adder/collator/bin/puppet_worker.rs @@ -0,0 +1,17 @@ +// Copyright 2021 Parity Technologies (UK) Ltd. +// This file is part of Polkadot. + +// Polkadot is free software: you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation, either version 3 of the License, or +// (at your option) any later version. + +// Polkadot is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. + +// You should have received a copy of the GNU General Public License +// along with Polkadot. If not, see . + +polkadot_node_core_pvf::decl_puppet_worker_main!(); diff --git a/parachain/test-parachains/adder/collator/src/lib.rs b/parachain/test-parachains/adder/collator/src/lib.rs index 03724ea79d99..80510a189d0a 100644 --- a/parachain/test-parachains/adder/collator/src/lib.rs +++ b/parachain/test-parachains/adder/collator/src/lib.rs @@ -233,7 +233,7 @@ mod tests { use super::*; use futures::executor::block_on; - use polkadot_parachain::{primitives::ValidationParams, wasm_executor::IsolationStrategy}; + use polkadot_parachain::{primitives::{ValidationParams, ValidationResult}}; use polkadot_primitives::v1::PersistedValidationData; #[test] @@ -268,18 +268,19 @@ mod tests { parent_head: HeadData, collation: Collation, ) { - let ret = polkadot_parachain::wasm_executor::validate_candidate( + use polkadot_node_core_pvf::testing::validate_candidate; + + let ret_buf = validate_candidate( collator.validation_code(), - ValidationParams { + &ValidationParams { parent_head: parent_head.encode().into(), block_data: collation.proof_of_validity.block_data, relay_parent_number: 1, relay_parent_storage_root: Default::default(), - }, - &IsolationStrategy::InProcess, - sp_core::testing::TaskExecutor::new(), + }.encode(), ) .unwrap(); + let ret = ValidationResult::decode(&mut &ret_buf[..]).unwrap(); let new_head = HeadData::decode(&mut &ret.head_data.0[..]).unwrap(); assert_eq!( diff --git a/parachain/test-parachains/adder/collator/tests/integration.rs b/parachain/test-parachains/adder/collator/tests/integration.rs index e0f6d2d28ad9..574a9abf84df 100644 --- a/parachain/test-parachains/adder/collator/tests/integration.rs +++ b/parachain/test-parachains/adder/collator/tests/integration.rs @@ -17,6 +17,9 @@ //! Integration test that ensures that we can build and include parachain //! blocks of the adder parachain. +const PUPPET_EXE: &str = env!("CARGO_BIN_EXE_adder_collator_puppet_worker"); + +// If this test is failing, make sure to run all tests with the `real-overseer` feature being enabled. #[substrate_test_utils::test] async fn collating_using_adder_collator(task_executor: sc_service::TaskExecutor) { use sp_keyring::AccountKeyring::*; @@ -30,7 +33,12 @@ async fn collating_using_adder_collator(task_executor: sc_service::TaskExecutor) let para_id = ParaId::from(100); // start alice - let alice = polkadot_test_service::run_validator_node(task_executor.clone(), Alice, || {}, vec![]); + let alice = polkadot_test_service::run_validator_node( + task_executor.clone(), + Alice, || {}, + vec![], + Some(PUPPET_EXE.into()), + ); // start bob let bob = polkadot_test_service::run_validator_node( @@ -38,6 +46,7 @@ async fn collating_using_adder_collator(task_executor: sc_service::TaskExecutor) Bob, || {}, vec![alice.addr.clone()], + Some(PUPPET_EXE.into()), ); let collator = test_parachain_adder_collator::Collator::new(); diff --git a/parachain/test-parachains/tests/adder/mod.rs b/parachain/test-parachains/tests/adder/mod.rs deleted file mode 100644 index 2dc581208106..000000000000 --- a/parachain/test-parachains/tests/adder/mod.rs +++ /dev/null @@ -1,153 +0,0 @@ -// Copyright 2017-2020 Parity Technologies (UK) Ltd. -// This file is part of Polkadot. - -// Polkadot is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. - -// Polkadot is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. - -// You should have received a copy of the GNU General Public License -// along with Polkadot. If not, see . - -//! Basic parachain that adds a number as part of its state. - -const WORKER_ARGS_TEST: &[&'static str] = &["--nocapture", "validation_worker"]; - -use parachain::{ - primitives::{ - RelayChainBlockNumber, - BlockData as GenericBlockData, - HeadData as GenericHeadData, - ValidationParams, - }, - wasm_executor::{ValidationPool, IsolationStrategy} -}; -use parity_scale_codec::{Decode, Encode}; -use adder::{HeadData, BlockData, hash_state}; - -fn isolation_strategy() -> IsolationStrategy { - IsolationStrategy::ExternalProcessCustomHost { - pool: ValidationPool::new(), - binary: std::env::current_exe().unwrap(), - args: WORKER_ARGS_TEST.iter().map(|x| x.to_string()).collect(), - } -} - -#[test] -fn execute_good_on_parent_with_inprocess_validation() { - let isolation_strategy = IsolationStrategy::InProcess; - execute_good_on_parent(isolation_strategy); -} - -#[test] -pub fn execute_good_on_parent_with_external_process_validation() { - let isolation_strategy = isolation_strategy(); - execute_good_on_parent(isolation_strategy); -} - -fn execute_good_on_parent(isolation_strategy: IsolationStrategy) { - let parent_head = HeadData { - number: 0, - parent_hash: [0; 32], - post_state: hash_state(0), - }; - - let block_data = BlockData { - state: 0, - add: 512, - }; - - let ret = parachain::wasm_executor::validate_candidate( - adder::wasm_binary_unwrap(), - ValidationParams { - parent_head: GenericHeadData(parent_head.encode()), - block_data: GenericBlockData(block_data.encode()), - relay_parent_number: 1, - relay_parent_storage_root: Default::default(), - }, - &isolation_strategy, - sp_core::testing::TaskExecutor::new(), - ).unwrap(); - - let new_head = HeadData::decode(&mut &ret.head_data.0[..]).unwrap(); - - assert_eq!(new_head.number, 1); - assert_eq!(new_head.parent_hash, parent_head.hash()); - assert_eq!(new_head.post_state, hash_state(512)); -} - -#[test] -fn execute_good_chain_on_parent() { - let mut number = 0; - let mut parent_hash = [0; 32]; - let mut last_state = 0; - let isolation_strategy = isolation_strategy(); - - for add in 0..10 { - let parent_head = HeadData { - number, - parent_hash, - post_state: hash_state(last_state), - }; - - let block_data = BlockData { - state: last_state, - add, - }; - - let ret = parachain::wasm_executor::validate_candidate( - adder::wasm_binary_unwrap(), - ValidationParams { - parent_head: GenericHeadData(parent_head.encode()), - block_data: GenericBlockData(block_data.encode()), - relay_parent_number: number as RelayChainBlockNumber + 1, - relay_parent_storage_root: Default::default(), - }, - &isolation_strategy, - sp_core::testing::TaskExecutor::new(), - ).unwrap(); - - let new_head = HeadData::decode(&mut &ret.head_data.0[..]).unwrap(); - - assert_eq!(new_head.number, number + 1); - assert_eq!(new_head.parent_hash, parent_head.hash()); - assert_eq!(new_head.post_state, hash_state(last_state + add)); - - number += 1; - parent_hash = new_head.hash(); - last_state += add; - } -} - -#[test] -fn execute_bad_on_parent() { - let isolation_strategy = isolation_strategy(); - - let parent_head = HeadData { - number: 0, - parent_hash: [0; 32], - post_state: hash_state(0), - }; - - let block_data = BlockData { - state: 256, // start state is wrong. - add: 256, - }; - - let _ret = parachain::wasm_executor::validate_candidate( - adder::wasm_binary_unwrap(), - ValidationParams { - parent_head: GenericHeadData(parent_head.encode()), - block_data: GenericBlockData(block_data.encode()), - relay_parent_number: 1, - relay_parent_storage_root: Default::default(), - }, - &isolation_strategy, - sp_core::testing::TaskExecutor::new(), - ).unwrap_err(); -} diff --git a/parachain/test-parachains/tests/wasm_executor/mod.rs b/parachain/test-parachains/tests/wasm_executor/mod.rs deleted file mode 100644 index a034ae5908f0..000000000000 --- a/parachain/test-parachains/tests/wasm_executor/mod.rs +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright 2019-2020 Parity Technologies (UK) Ltd. -// This file is part of Polkadot. - -// Polkadot is free software: you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation, either version 3 of the License, or -// (at your option) any later version. - -// Polkadot is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. - -// You should have received a copy of the GNU General Public License -// along with Polkadot. If not, see . - -//! Basic parachain that adds a number as part of its state. - -const WORKER_ARGS_TEST: &[&'static str] = &["--nocapture", "validation_worker"]; - -use crate::adder; -use parachain::{ - primitives::{BlockData, ValidationParams}, - wasm_executor::{ValidationError, InvalidCandidate, EXECUTION_TIMEOUT_SEC, IsolationStrategy, ValidationPool}, -}; - -fn isolation_strategy() -> IsolationStrategy { - IsolationStrategy::ExternalProcessCustomHost { - pool: ValidationPool::new(), - binary: std::env::current_exe().unwrap(), - args: WORKER_ARGS_TEST.iter().map(|x| x.to_string()).collect(), - } -} - -#[test] -fn terminates_on_timeout() { - let isolation_strategy = isolation_strategy(); - - let result = parachain::wasm_executor::validate_candidate( - halt::wasm_binary_unwrap(), - ValidationParams { - block_data: BlockData(Vec::new()), - parent_head: Default::default(), - relay_parent_number: 1, - relay_parent_storage_root: Default::default(), - }, - &isolation_strategy, - sp_core::testing::TaskExecutor::new(), - ); - match result { - Err(ValidationError::InvalidCandidate(InvalidCandidate::Timeout)) => {}, - r => panic!("{:?}", r), - } - - // check that another parachain can validate normaly - adder::execute_good_on_parent_with_external_process_validation(); -} - -#[test] -fn parallel_execution() { - let isolation_strategy = isolation_strategy(); - let isolation_strategy_clone = isolation_strategy.clone(); - - let start = std::time::Instant::now(); - let thread = std::thread::spawn(move || - parachain::wasm_executor::validate_candidate( - halt::wasm_binary_unwrap(), - ValidationParams { - block_data: BlockData(Vec::new()), - parent_head: Default::default(), - relay_parent_number: 1, - relay_parent_storage_root: Default::default(), - }, - &isolation_strategy, - sp_core::testing::TaskExecutor::new(), - ).ok()); - let _ = parachain::wasm_executor::validate_candidate( - halt::wasm_binary_unwrap(), - ValidationParams { - block_data: BlockData(Vec::new()), - parent_head: Default::default(), - relay_parent_storage_root: Default::default(), - relay_parent_number: 1, - }, - &isolation_strategy_clone, - sp_core::testing::TaskExecutor::new(), - ); - thread.join().unwrap(); - // total time should be < 2 x EXECUTION_TIMEOUT_SEC - assert!( - std::time::Instant::now().duration_since(start) - < std::time::Duration::from_secs(EXECUTION_TIMEOUT_SEC * 2) - ); -}