Skip to content
This repository has been archived by the owner on Nov 15, 2023. It is now read-only.

pvf: ensure enough stack space #5712

Merged
merged 3 commits into from
Jun 24, 2022
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions node/core/pvf/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ gum = { package = "tracing-gum", path = "../../gum" }
pin-project = "1.0.9"
rand = "0.8.5"
tempfile = "3.3.0"
crossbeam-utils = "0.8.9"
ordian marked this conversation as resolved.
Show resolved Hide resolved
parity-scale-codec = { version = "3.1.2", default-features = false, features = ["derive"] }
polkadot-parachain = { path = "../../../parachain" }
polkadot-core-primitives = { path = "../../../core-primitives" }
Expand Down
15 changes: 15 additions & 0 deletions node/core/pvf/src/error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
// along with Polkadot. If not, see <http://www.gnu.org/licenses/>.

use parity_scale_codec::{Decode, Encode};
use std::any::Any;

/// Result of PVF preparation performed by the validation host.
pub type PrepareResult = Result<(), PrepareError>;
Expand Down Expand Up @@ -108,3 +109,17 @@ impl From<PrepareError> for ValidationError {
}
}
}

/// Attempt to convert an opaque panic payload to a string.
///
/// This is a best effort, and is not guaranteed to provide the most accurate value.
pub(crate) fn stringify_panic_payload(payload: Box<dyn Any + Send + 'static>) -> String {
match payload.downcast::<&'static str>() {
Ok(msg) => msg.to_string(),
Err(payload) => match payload.downcast::<String>() {
Ok(msg) => *msg,
// At least we tried...
Err(_) => "unkown panic payload".to_string(),
pepyakin marked this conversation as resolved.
Show resolved Hide resolved
},
}
}
73 changes: 72 additions & 1 deletion node/core/pvf/src/executor_intf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,9 @@ use std::{
const DEFAULT_HEAP_PAGES_ESTIMATE: u64 = 32;
const EXTRA_HEAP_PAGES: u64 = 2048;

/// The number of bytes devoted for the stack during wasm execution of a PVF.
const NATIVE_STACK_MAX: u32 = 256 * 1024 * 1024;

const CONFIG: Config = Config {
allow_missing_func_imports: true,
cache_path: None,
Expand All @@ -69,7 +72,7 @@ const CONFIG: Config = Config {
// the stack limit set by the wasmtime.
deterministic_stack_limit: Some(DeterministicStackLimit {
logical_max: 65536,
native_stack_max: 256 * 1024 * 1024,
native_stack_max: NATIVE_STACK_MAX,
}),
canonicalize_nans: true,
// Rationale for turning the multi-threaded compilation off is to make the preparation time
Expand Down Expand Up @@ -115,6 +118,74 @@ pub unsafe fn execute(
compiled_artifact_path: &Path,
params: &[u8],
spawner: impl sp_core::traits::SpawnNamed + 'static,
) -> Result<Vec<u8>, String> {
// Wasmtime powers the Substrate Executor. It compiles the wasm bytecode into native code.
// That native code does not create any stacks and just reuses the stack of the thread that
// wasmtime was invoked from.
//
// Also, we configure the executor to provide the deterministic stack and that requires
// supplying the amount of the native stack space that wasm is allowed to use. This is
// realized by supplying the limit into `wasmtime::Config::max_wasm_stack`.
//
// There are quirks to that configuration knob:
//
// 1. It only limits the amount of stack space consumed by wasm but does not ensure nor check
// that the stack space is actually available.
//
// That means, if the calling thread has 1 MiB of stack space left and the wasm code consumes
// more, then the wasmtime limit will **not** trigger. Instead, the wasm code will hit the
// guard page and the Rust stack overflow handler will be triggered. That leads to an
// **abort**.
//
// 2. It cannot and does not limit the stack space consumed by Rust code.
//
// Meaning that if the wasm code leaves no stack space for Rust code, then the Rust code
// and that will abort the process as well.
//
// Typically on Linux the main thread gets the stack size specified by the `ulimit` and
// typically it's configured to 8 MiB. Rust's spawned threads are 2 MiB. OTOH, the
// NATIVE_STACK_MAX is set to 256 MiB. Not nearly enough.
//
// Hence we need to increase it.
//
// The simplest way to fix that is to spawn a new thread with the desired stack limit. One may
// think that it may be too expensive, but I disagree. The cost of creating a new thread is
// measured in microseconds, negligable compared to other costs.
ordian marked this conversation as resolved.
Show resolved Hide resolved
//
// The reasoning why we pick this particular size is:
//
// The default Rust thread stack limit 2 MiB + 256 MiB wasm stack.
let thread_stack_size = 2 * 1024 * 1024 + NATIVE_STACK_MAX as usize;

crossbeam_utils::thread::scope(|s| {
s.builder()
.name("wasm executor".to_string())
.stack_size(thread_stack_size)
.spawn(move |_| do_execute(compiled_artifact_path, params, spawner))
.map_err(|e| format!("failed to spawn executor thread: {}", e))?
.join()
.map_err(|e| {
format!(
"the executor thread panicked: {}",
crate::error::stringify_panic_payload(e)
)
})?
.map_err(|e| format!("executor failed: {}", e))
})
.map_err(|e| {
// despite the fact that we join the thread above and handle this, we still got an error.
// Not sure how this could've happened, but we just handle by returning an error.
format!(
"the executor thread panicked regardless of joining: {}",
crate::error::stringify_panic_payload(e)
)
})?
}

unsafe fn do_execute(
compiled_artifact_path: &Path,
params: &[u8],
spawner: impl sp_core::traits::SpawnNamed + 'static,
) -> Result<Vec<u8>, sc_executor_common::error::Error> {
let mut extensions = sp_externalities::Extensions::new();

Expand Down
20 changes: 4 additions & 16 deletions node/core/pvf/src/prepare/worker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ use async_std::{
};
use parity_scale_codec::{Decode, Encode};
use sp_core::hexdisplay::HexDisplay;
use std::{any::Any, panic, sync::Arc, time::Duration};
use std::{panic, sync::Arc, time::Duration};

/// The time period after which the preparation worker is considered unresponsive and will be killed.
// NOTE: If you change this make sure to fix the buckets of `pvf_preparation_time` metric.
Expand Down Expand Up @@ -294,20 +294,8 @@ fn prepare_artifact(code: &[u8]) -> Result<CompiledArtifact, PrepareError> {
Err(err) => Err(PrepareError::Preparation(format!("{:?}", err))),
}
})
.map_err(|panic_payload| PrepareError::Panic(stringify_panic_payload(panic_payload)))
.map_err(|panic_payload| {
PrepareError::Panic(crate::error::stringify_panic_payload(panic_payload))
})
.and_then(|inner_result| inner_result)
}

/// Attempt to convert an opaque panic payload to a string.
///
/// This is a best effort, and is not guaranteed to provide the most accurate value.
fn stringify_panic_payload(payload: Box<dyn Any + Send + 'static>) -> String {
match payload.downcast::<&'static str>() {
Ok(msg) => msg.to_string(),
Err(payload) => match payload.downcast::<String>() {
Ok(msg) => *msg,
// At least we tried...
Err(_) => "unkown panic payload".to_string(),
},
}
}