Skip to content

Commit

Permalink
Merge pull request #4195 from DataDog/ivoanjo/prof-11003-fix-profiler…
Browse files Browse the repository at this point in the history
…-otel-initialization

[PROF-11003] Fix unsafe initialization when using profiler with otel tracing
  • Loading branch information
ivoanjo authored Dec 5, 2024
2 parents f354358 + 47b1706 commit 6f2057b
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 17 deletions.
64 changes: 47 additions & 17 deletions ext/datadog_profiling_native_extension/collectors_thread_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,8 @@ static void trigger_sample_for_thread(
long current_monotonic_wall_time_ns,
ddog_CharSlice *ruby_vm_type,
ddog_CharSlice *class_name,
bool is_gvl_waiting_state
bool is_gvl_waiting_state,
bool is_safe_to_allocate_objects
);
static VALUE _native_thread_list(VALUE self);
static struct per_thread_context *get_or_create_context_for(VALUE thread, struct thread_context_collector_state *state);
Expand All @@ -246,7 +247,12 @@ static long cpu_time_now_ns(struct per_thread_context *thread_context);
static long thread_id_for(VALUE thread);
static VALUE _native_stats(VALUE self, VALUE collector_instance);
static VALUE _native_gc_tracking(VALUE self, VALUE collector_instance);
static void trace_identifiers_for(struct thread_context_collector_state *state, VALUE thread, struct trace_identifiers *trace_identifiers_result);
static void trace_identifiers_for(
struct thread_context_collector_state *state,
VALUE thread,
struct trace_identifiers *trace_identifiers_result,
bool is_safe_to_allocate_objects
);
static bool should_collect_resource(VALUE root_span);
static VALUE _native_reset_after_fork(DDTRACE_UNUSED VALUE self, VALUE collector_instance);
static VALUE thread_list(struct thread_context_collector_state *state);
Expand All @@ -259,7 +265,8 @@ static void ddtrace_otel_trace_identifiers_for(
VALUE *root_span,
VALUE *numeric_span_id,
VALUE active_span,
VALUE otel_values
VALUE otel_values,
bool is_safe_to_allocate_objects
);
static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self, VALUE collector_instance, VALUE skipped_samples);
static bool handle_gvl_waiting(
Expand All @@ -278,7 +285,8 @@ static VALUE _native_apply_delta_to_cpu_time_at_previous_sample_ns(DDTRACE_UNUSE
static void otel_without_ddtrace_trace_identifiers_for(
struct thread_context_collector_state *state,
VALUE thread,
struct trace_identifiers *trace_identifiers_result
struct trace_identifiers *trace_identifiers_result,
bool is_safe_to_allocate_objects
);
static struct otel_span otel_span_from(VALUE otel_context, VALUE otel_current_span_key);
static uint64_t otel_span_id_to_uint(VALUE otel_span_id);
Expand Down Expand Up @@ -647,7 +655,8 @@ static void update_metrics_and_sample(
current_monotonic_wall_time_ns,
NULL,
NULL,
is_gvl_waiting_state
is_gvl_waiting_state,
/* is_safe_to_allocate_objects: */ true // We called from a context that's safe to run any regular code, including allocations
);
}

Expand Down Expand Up @@ -833,7 +842,10 @@ static void trigger_sample_for_thread(
// These two labels are only used for allocation profiling; @ivoanjo: may want to refactor this at some point?
ddog_CharSlice *ruby_vm_type,
ddog_CharSlice *class_name,
bool is_gvl_waiting_state
bool is_gvl_waiting_state,
// If the Ruby VM is at a state that can allocate objects safely, or not. Added for allocation profiling: we're not
// allowed to allocate objects (or raise exceptions) when inside the NEWOBJ tracepoint.
bool is_safe_to_allocate_objects
) {
int max_label_count =
1 + // thread id
Expand Down Expand Up @@ -872,11 +884,11 @@ static void trigger_sample_for_thread(
}

struct trace_identifiers trace_identifiers_result = {.valid = false, .trace_endpoint = Qnil};
trace_identifiers_for(state, thread, &trace_identifiers_result);
trace_identifiers_for(state, thread, &trace_identifiers_result, is_safe_to_allocate_objects);

if (!trace_identifiers_result.valid && state->otel_context_enabled != OTEL_CONTEXT_ENABLED_FALSE) {
// If we couldn't get something with ddtrace, let's see if we can get some trace identifiers from opentelemetry directly
otel_without_ddtrace_trace_identifiers_for(state, thread, &trace_identifiers_result);
otel_without_ddtrace_trace_identifiers_for(state, thread, &trace_identifiers_result, is_safe_to_allocate_objects);
}

if (trace_identifiers_result.valid) {
Expand Down Expand Up @@ -1289,7 +1301,12 @@ static VALUE _native_gc_tracking(DDTRACE_UNUSED VALUE _self, VALUE collector_ins
}

// Assumption 1: This function is called in a thread that is holding the Global VM Lock. Caller is responsible for enforcing this.
static void trace_identifiers_for(struct thread_context_collector_state *state, VALUE thread, struct trace_identifiers *trace_identifiers_result) {
static void trace_identifiers_for(
struct thread_context_collector_state *state,
VALUE thread,
struct trace_identifiers *trace_identifiers_result,
bool is_safe_to_allocate_objects
) {
if (state->otel_context_enabled == OTEL_CONTEXT_ENABLED_ONLY) return;
if (state->tracer_context_key == MISSING_TRACER_CONTEXT_KEY) return;

Expand All @@ -1308,7 +1325,9 @@ static void trace_identifiers_for(struct thread_context_collector_state *state,

VALUE numeric_span_id = Qnil;

if (otel_values != Qnil) ddtrace_otel_trace_identifiers_for(state, &active_trace, &root_span, &numeric_span_id, active_span, otel_values);
if (otel_values != Qnil) {
ddtrace_otel_trace_identifiers_for(state, &active_trace, &root_span, &numeric_span_id, active_span, otel_values, is_safe_to_allocate_objects);
}

if (root_span == Qnil || (active_span == Qnil && numeric_span_id == Qnil)) return;

Expand Down Expand Up @@ -1474,7 +1493,8 @@ void thread_context_collector_sample_allocation(VALUE self_instance, unsigned in
INVALID_TIME, // For now we're not collecting timestamps for allocation events, as per profiling team internal discussions
&ruby_vm_type,
optional_class_name,
false
/* is_gvl_waiting_state: */ false,
/* is_safe_to_allocate_objects: */ false // Not safe to allocate further inside the NEWOBJ tracepoint
);
}

Expand Down Expand Up @@ -1529,11 +1549,18 @@ static VALUE read_otel_current_span_key_const(DDTRACE_UNUSED VALUE _unused) {
return rb_const_get(trace_module, rb_intern("CURRENT_SPAN_KEY"));
}

static VALUE get_otel_current_span_key(struct thread_context_collector_state *state) {
static VALUE get_otel_current_span_key(struct thread_context_collector_state *state, bool is_safe_to_allocate_objects) {
if (state->otel_current_span_key == Qtrue) { // Qtrue means we haven't tried to extract it yet
if (!is_safe_to_allocate_objects) {
// Calling read_otel_current_span_key_const below can trigger exceptions and arbitrary Ruby code running (e.g.
// `const_missing`, etc). Not safe to call in this situation, so we just skip otel info for this sample.
return Qnil;
}

// If this fails, we want to fail gracefully, rather than raise an exception (e.g. if the opentelemetry gem
// gets refactored, we should not fall on our face)
VALUE span_key = rb_protect(read_otel_current_span_key_const, Qnil, NULL);
rb_set_errinfo(Qnil); // **Clear any pending exception after ignoring it**

// Note that this gets set to Qnil if we failed to extract the correct value, and thus we won't try to extract it again
state->otel_current_span_key = span_key;
Expand All @@ -1550,7 +1577,8 @@ static void ddtrace_otel_trace_identifiers_for(
VALUE *root_span,
VALUE *numeric_span_id,
VALUE active_span,
VALUE otel_values
VALUE otel_values,
bool is_safe_to_allocate_objects
) {
VALUE resolved_numeric_span_id =
active_span == Qnil ?
Expand All @@ -1561,7 +1589,7 @@ static void ddtrace_otel_trace_identifiers_for(

if (resolved_numeric_span_id == Qnil) return;

VALUE otel_current_span_key = get_otel_current_span_key(state);
VALUE otel_current_span_key = get_otel_current_span_key(state, is_safe_to_allocate_objects);
if (otel_current_span_key == Qnil) return;
VALUE current_trace = *active_trace;

Expand Down Expand Up @@ -1640,14 +1668,15 @@ static VALUE _native_sample_skipped_allocation_samples(DDTRACE_UNUSED VALUE self
static void otel_without_ddtrace_trace_identifiers_for(
struct thread_context_collector_state *state,
VALUE thread,
struct trace_identifiers *trace_identifiers_result
struct trace_identifiers *trace_identifiers_result,
bool is_safe_to_allocate_objects
) {
VALUE context_storage = rb_thread_local_aref(thread, otel_context_storage_id /* __opentelemetry_context_storage__ */);

// If it exists, context_storage is expected to be an Array[OpenTelemetry::Context]
if (context_storage == Qnil || !RB_TYPE_P(context_storage, T_ARRAY)) return;

VALUE otel_current_span_key = get_otel_current_span_key(state);
VALUE otel_current_span_key = get_otel_current_span_key(state, is_safe_to_allocate_objects);
if (otel_current_span_key == Qnil) return;

int active_context_index = RARRAY_LEN(context_storage) - 1;
Expand Down Expand Up @@ -1939,7 +1968,8 @@ static uint64_t otel_span_id_to_uint(VALUE otel_span_id) {
gvl_waiting_started_wall_time_ns,
NULL,
NULL,
false // This is the extra sample before the wait begun; only the next sample will be in the gvl waiting state
/* is_gvl_waiting_state: */ false, // This is the extra sample before the wait begun; only the next sample will be in the gvl waiting state
/* is_safe_to_allocate_objects: */ true // This is similar to a regular cpu/wall sample, so it's also safe
);
}

Expand Down
49 changes: 49 additions & 0 deletions spec/datadog/profiling/collectors/thread_context_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -765,6 +765,55 @@ def otel_span_id_to_i(span_id)
expect(t1_sample.labels).to_not include("trace endpoint": anything)
end

context 'reading CURRENT_SPAN_KEY' do
let!(:ran_log) { [] }

let(:setup_failure) do
log = ran_log

stub_const(
"OpenTelemetry::Trace",
Module.new do
define_singleton_method(:const_missing) do |_value|
log << :ran_code
raise "Simulated failure"
end
end
)
end

context 'raises an exception' do
before { setup_failure }
after { expect(ran_log).to eq [:ran_code] }

it 'does not leave the exception pending' do
sample

expect($!).to be nil
end

it 'omits the "local root span id" and "span id" labels in the sample' do
sample

expect(t1_sample.labels.keys).to_not include(:"local root span id", :"span id")
end
end

context 'during allocation sampling' do
it 'does not try to read the CURRENT_SPAN_KEY' do
allow(OpenTelemetry.logger).to receive(:error)

otel_tracer.in_span("profiler.test") do |_span|
setup_failure

sample_allocation(weight: 1)
end

expect(ran_log).to eq []
end
end
end

context 'when otel_context_enabled is false' do
let(:otel_context_enabled) { false }

Expand Down

0 comments on commit 6f2057b

Please sign in to comment.