Skip to content

Commit

Permalink
fix(prof): avoid dlclose if threads did not join
Browse files Browse the repository at this point in the history
If the PHP engine dlclose's the handle and the shared object is
unloaded while another thread is running, we've hit undefined behavior.
This also probably results in a crash (on platforms that unload instead
of no-op).

This may be the source of a crash when php-fpm does a log rotate. When
doing the rotate, php-fpm shuts down all workers. If a worker is slow to
process an upload and the timeout is hit, then we could hit this issue.
  • Loading branch information
morrisonlevi committed Feb 6, 2025
1 parent 9cc3c1f commit a33e3fe
Show file tree
Hide file tree
Showing 5 changed files with 132 additions and 47 deletions.
53 changes: 37 additions & 16 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions profiling/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ serde_json = {version = "1.0"}
rand = { version = "0.8.5" }
rand_distr = { version = "0.4.3" }
rustc-hash = "1.1.0"
thiserror = "2"
uuid = { version = "1.0", features = ["v4"] }

[dev-dependencies]
Expand Down
29 changes: 23 additions & 6 deletions profiling/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ mod exception;
mod timeline;

use crate::config::{SystemSettings, INITIAL_SYSTEM_SETTINGS};
use crate::profiling::ShutdownError;
use crate::zend::datadog_sapi_globals_request_info;
use bindings::{
self as zend, ddog_php_prof_php_version, ddog_php_prof_php_version_id, ZendExtension,
ZendResult,
Expand All @@ -46,8 +48,6 @@ use std::sync::{Arc, Once};
use std::time::{Duration, Instant};
use uuid::Uuid;

use crate::zend::datadog_sapi_globals_request_info;

/// Name of the profiling module and zend_extension. Must not contain any
/// interior null bytes and must be null terminated.
static PROFILER_NAME: &[u8] = b"datadog-profiling\0";
Expand Down Expand Up @@ -889,11 +889,28 @@ extern "C" fn startup(extension: *mut ZendExtension) -> ZendResult {
ZendResult::Success
}

extern "C" fn shutdown(_extension: *mut ZendExtension) {
extern "C" fn shutdown(extension: *mut ZendExtension) {
#[cfg(debug_assertions)]
trace!("shutdown({:p})", _extension);

Profiler::shutdown(Duration::from_secs(2));
trace!("shutdown({:p})", extension);

match Profiler::shutdown(Duration::from_secs(2)) {
Ok(hit_timeout) => {
// If a timeout was reached, then the thread is probably alive.
// This means the engine cannot unload our handle, or else we'd
// hit immediate undefined behavior (and likely crash).
if hit_timeout {
// SAFETY: during mshutdown, we have ownership of the extension
// struct. Our threads (which failed to join) do not mutate
// this struct at all either, providing no races.
unsafe { (*extension).handle = ptr::null_mut() }
}
}
Err(err) => match err {
// todo: do we actually need to panic/unwind here? We're already
// shutting down... can we just be graceful?
ShutdownError::ThreadPanic { payload } => std::panic::resume_unwind(payload.0),
},
}

// SAFETY: calling in shutdown before zai config is shutdown, and after
// all configuration is done being accessed. Well... in the happy-path,
Expand Down
61 changes: 46 additions & 15 deletions profiling/src/profiling/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,9 @@ use crate::bindings::ddog_php_prof_get_active_fiber_test as ddog_php_prof_get_ac

use crate::bindings::{datadog_php_profiling_get_profiling_context, zend_execute_data};
use crate::config::SystemSettings;
use crate::profiling::thread_utils::{JoinError, ThreadPanicError};
use crate::{CLOCKS, TAGS};
use chrono::Utc;
#[cfg(feature = "timeline")]
use core::{ptr, str};
use crossbeam_channel::{Receiver, Sender, TrySendError};
use datadog_profiling::api::{
Function, Label as ApiLabel, Location, Period, Sample, ValueType as ApiValueType,
Expand All @@ -35,11 +34,14 @@ use std::hash::Hash;
use std::intrinsics::transmute;
use std::mem::forget;
use std::num::NonZeroI64;
use std::ops::BitOrAssign;
use std::sync::atomic::{AtomicBool, AtomicPtr, AtomicU32, Ordering};
use std::sync::{Arc, Barrier};
use std::thread::JoinHandle;
use std::time::{Duration, Instant, SystemTime};

#[cfg(feature = "timeline")]
use core::{ptr, str};
#[cfg(feature = "timeline")]
use std::time::UNIX_EPOCH;

Expand Down Expand Up @@ -689,33 +691,53 @@ impl Profiler {
/// Completes the shutdown process; to start it, call [Profiler::stop]
/// before calling [Profiler::shutdown].
/// Note the timeout is per thread, and there may be multiple threads.
/// Returns Ok(true) if any thread hit a timeout.
///
/// Safety: only safe to be called in `SHUTDOWN`/`MSHUTDOWN` phase
pub fn shutdown(timeout: Duration) {
pub fn shutdown(timeout: Duration) -> Result<bool, ShutdownError> {
// SAFETY: the `take` access is a thread-safe API, and the PROFILER is
// not being mutated outside single-threaded phases such as minit and
// mshutdown.
if let Some(profiler) = unsafe { PROFILER.take() } {
profiler.join_collector_and_uploader(timeout);
Ok(profiler.join_collector_and_uploader(timeout)?)
} else {
Ok(false)
}
}

pub fn join_collector_and_uploader(self, timeout: Duration) {
fn join_collector_and_uploader(self, timeout: Duration) -> Result<bool, ThreadPanicError> {
if self.should_join.load(Ordering::SeqCst) {
thread_utils::join_timeout(
self.time_collector_handle,
timeout,
"Recent samples may be lost.",
);
let result = thread_utils::join_timeout(self.time_collector_handle, timeout);
let mut hit_timeout = if let Err(err) = result {
match err {
JoinError::ThreadPanic(err) => Err(err)?,
timeout_err => {
warn!("{}, recent samples may be lost", timeout_err);
true
}
}
} else {
false
};

// Wait for the time_collector to join, since that will drop
// the sender half of the channel that the uploader is
// holding, allowing it to finish.
thread_utils::join_timeout(
self.uploader_handle,
timeout,
"Recent samples are most likely lost.",
);
let result = thread_utils::join_timeout(self.uploader_handle, timeout);
hit_timeout.bitor_assign(if let Err(err) = result {
match err {
JoinError::ThreadPanic(err) => Err(err)?,
timeout_err => {
warn!("{}, recent samples are most likely lost", timeout_err);
true
}
}
} else {
false
});
Ok(hit_timeout)
} else {
Ok(false)
}
}

Expand Down Expand Up @@ -1288,6 +1310,15 @@ impl Profiler {
}
}

#[derive(thiserror::Error, Debug)]
pub enum ShutdownError {
#[error(transparent)]
ThreadPanic {
#[from]
payload: ThreadPanicError,
},
}

#[cfg(test)]
mod tests {
use super::*;
Expand Down
Loading

0 comments on commit a33e3fe

Please sign in to comment.