Performance live heap profiling - deallocation code path (#298)
* Performance live heap profiling - deallocation code path
Add a bitset to track the addresses that are kept for heap profiling.
Remove the lock in the deallocation code path.
Add a stat on unmatched deallocations, which could be a source of increased CPU consumption.
r1viollet authored Oct 12, 2023
1 parent d078da2 commit 53fa554
Showing 19 changed files with 530 additions and 116 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -199,6 +199,7 @@ set(DD_PROFILING_SOURCES
src/ddprof_cmdline.cc
src/ddres_list.cc
src/ipc.cc
src/lib/address_bitset.cc
src/lib/allocation_tracker.cc
src/lib/dd_profiling.cc
src/lib/elfutils.cc
1 change: 1 addition & 0 deletions include/ddprof_stats.hpp
@@ -14,6 +14,7 @@
X(EVENT_COUNT, "event.count", STAT_GAUGE) \
X(EVENT_LOST, "event.lost", STAT_GAUGE) \
X(SAMPLE_COUNT, "sample.count", STAT_GAUGE) \
X(UNMATCHED_DEALLOCATION_COUNT, "unmatched_deallocation.count", STAT_GAUGE) \
X(TARGET_CPU_USAGE, "target_process.cpu_usage.millicores", STAT_GAUGE) \
X(UNWIND_AVG_TIME, "unwind.avg_time_ns", STAT_GAUGE) \
X(UNWIND_FRAMES, "unwind.frames", STAT_GAUGE) \
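The stats table above follows the X-macro pattern; the new entry is what the worker code later reads back as STATS_UNMATCHED_DEALLOCATION_COUNT. A minimal sketch of how such a table is typically expanded follows — the macro and enum names here are illustrative assumptions, not the actual contents of ddprof_stats.hpp (which also carries the STAT_GAUGE column).

// Illustrative X-macro expansion; STATS_TABLE and StatsId are made-up names.
#define STATS_TABLE(X)                                                    \
  X(SAMPLE_COUNT, "sample.count")                                         \
  X(UNMATCHED_DEALLOCATION_COUNT, "unmatched_deallocation.count")

enum StatsId {
#define X(name, str) STATS_##name,
  STATS_TABLE(X) // yields STATS_SAMPLE_COUNT, STATS_UNMATCHED_DEALLOCATION_COUNT
#undef X
  STATS_LEN,
};

inline const char *k_stat_names[] = {
#define X(name, str) str,
    STATS_TABLE(X) // yields "sample.count", "unmatched_deallocation.count"
#undef X
};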
69 changes: 69 additions & 0 deletions include/lib/address_bitset.hpp
@@ -0,0 +1,69 @@
// Unless explicitly stated otherwise all files in this repository are licensed
// under the Apache License Version 2.0. This product includes software
// developed at Datadog (https://www.datadoghq.com/). Copyright 2021-Present
// Datadog, Inc.
#pragma once

#include <atomic>
#include <memory>
#include <stdint.h>
#include <string.h>

namespace ddprof {
class AddressBitset {
// The number of bits is the number of addresses we can store:
// one address per individual bit, so a 4-bit set (1111) can hold 4 addresses.
// We hash the address to a number (to have an equal probability of using
// all bits), then use the mask to position this address in our bitset.
// Addr -> Hash -> Mask (to get usable bits) -> Position in the bitset
// Note: the hashing step might be bad for cache locality.
public:
// Default: 8 * 1024 * 1024 bits, i.e. 1 MiB of storage
constexpr static unsigned _k_default_bitset_size = 8 * 1024 * 1024;
explicit AddressBitset(unsigned bitset_size = 0) { init(bitset_size); }
AddressBitset(AddressBitset &&other) noexcept;
AddressBitset &operator=(AddressBitset &&other) noexcept;

AddressBitset(AddressBitset &other) = delete;
AddressBitset &operator=(AddressBitset &other) = delete;

// returns true if the element was inserted
bool add(uintptr_t addr);
// returns true if the element was removed
bool remove(uintptr_t addr);
void clear();
int count() const { return _nb_addresses; }

private:
static constexpr unsigned _k_max_bits_ignored = 4;
unsigned _lower_bits_ignored;
// element type
using Word_t = uint64_t;
constexpr static unsigned _nb_bits_per_word = sizeof(Word_t) * 8;
// 1 MiB of bits divided into uint64_t-sized words
// The probability of collision is proportional to the number of elements
// already within the bitset
unsigned _bitset_size = {};
unsigned _k_nb_words = {};
unsigned _nb_bits_mask = {};
// We can not use an actual bitset (for atomicity reasons)
std::unique_ptr<std::atomic<uint64_t>[]> _address_bitset;
std::atomic<int> _nb_addresses = 0;

void init(unsigned bitset_size);

void move_from(AddressBitset &other) noexcept;
// This is a kind of hash function:
// we drop the lower bits (the alignment constraints make them useless),
// fold the address,
// then keep only the bits that matter for the position in the bitmap.
uint32_t hash_significant_bits(uintptr_t h1) {
uint64_t intermediate = h1 >> _lower_bits_ignored;
uint32_t high = (uint32_t)(intermediate >> 32);
uint32_t low = (uint32_t)intermediate;
uint32_t res = high ^ low;
return res & _nb_bits_mask;
}
};
} // namespace ddprof
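To make the Addr -> Hash -> Mask -> Position comment concrete, here is a standalone sketch of the same bit arithmetic using the default constants from this header; the example address is arbitrary and the snippet is illustrative rather than a copy of the implementation.

#include <cstdint>
#include <cstdio>

int main() {
  // Constants mirroring the defaults above: 8M bits, low 4 bits ignored.
  constexpr unsigned k_bitset_size = 8 * 1024 * 1024;
  constexpr unsigned k_lower_bits_ignored = 4;
  constexpr unsigned k_bits_per_word = 64;
  constexpr uint32_t k_mask = k_bitset_size - 1;

  const uintptr_t addr = 0x7f3a12345670; // arbitrary 16-byte-aligned address
  const uint64_t folded = static_cast<uint64_t>(addr) >> k_lower_bits_ignored;
  const uint32_t hash =
      static_cast<uint32_t>(folded >> 32) ^ static_cast<uint32_t>(folded);
  const uint32_t bit_index = hash & k_mask;                 // position in the bitset
  const unsigned word_index = bit_index / k_bits_per_word;  // which uint64 word
  const unsigned bit_offset = bit_index % k_bits_per_word;  // which bit in that word

  printf("bit %u -> word %u, offset %u\n", bit_index, word_index, bit_offset);
  return 0;
}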
9 changes: 6 additions & 3 deletions include/lib/allocation_tracker.hpp
@@ -5,6 +5,7 @@

#pragma once

#include "address_bitset.hpp"
#include "allocation_tracker_tls.hpp"
#include "ddprof_base.hpp"
#include "ddres_def.hpp"
@@ -63,7 +64,7 @@ class AllocationTracker
static TrackerThreadLocalState *get_tl_state();

private:
using AdressSet = std::unordered_set<uintptr_t>;
static constexpr unsigned k_ratio_max_elt_to_bitset_size = 16;

struct TrackerState {
void init(bool track_alloc, bool track_dealloc) {
@@ -86,7 +87,8 @@ class AllocationTracker
uint64_t next_sample_interval(std::minstd_rand &gen) const;

DDRes init(uint64_t mem_profile_interval, bool deterministic_sampling,
uint32_t stack_sample_size, const RingBufferInfo &ring_buffer);
bool track_deallocations, uint32_t stack_sample_size,
const RingBufferInfo &ring_buffer);
void free();

static AllocationTracker *create_instance();
Expand Down Expand Up @@ -116,7 +118,8 @@ class AllocationTracker {
uint32_t _stack_sample_size;
PEvent _pevent;
bool _deterministic_sampling;
AdressSet _address_set;

AddressBitset _allocated_address_set;

// These can not be tied to the internal state of the instance.
// The creation of the instance depends on this
23 changes: 17 additions & 6 deletions include/lib/lib_logger.hpp
@@ -10,13 +10,24 @@
#include <mutex>

namespace ddprof {
template <typename... Args>
void log_once(char const *const format, Args... args) {
#ifndef DEBUG
static std::once_flag flag;
std::call_once(flag, [&, format]() { fprintf(stderr, format, args...); });

#ifdef NDEBUG
template <typename Func>
void log_once_helper(std::once_flag &flag, Func &&func) {
std::call_once(flag, std::forward<Func>(func));
#else
fprintf(stderr, format, args...);
template <typename Func> void log_once_helper(std::once_flag &, Func &&func) {
func();
#endif
}

// create a once flag for the line and file where this is called:
#define LOG_ONCE(format, ...) \
do { \
static std::once_flag UNIQUE_ONCE_FLAG_##__COUNTER__; \
ddprof::log_once_helper(UNIQUE_ONCE_FLAG_##__COUNTER__, [&]() { \
fprintf(stderr, (format), ##__VA_ARGS__); \
}); \
} while (0)

} // namespace ddprof
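A hypothetical call site for the new macro (the function and message below are made up for illustration): each LOG_ONCE expansion owns its own once_flag, so release builds (NDEBUG) print the message at most once per call site, while debug builds print on every call.

#include "lib/lib_logger.hpp" // assumed include path

void report_lost_events(int nb_lost) {
  // Hypothetical usage: warn once per call site in release builds.
  LOG_ONCE("ddprof: lost %d allocation events\n", nb_lost);
}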
2 changes: 1 addition & 1 deletion include/live_allocation-c.hpp
@@ -11,7 +11,7 @@ namespace liveallocation {
// build time override to reduce execution time of test
static constexpr auto kMaxTracked = KMAX_TRACKED_ALLOCATIONS;
#else
static constexpr auto kMaxTracked = 500000;
static constexpr auto kMaxTracked = 524288; // 2^19
#endif
} // namespace liveallocation
} // namespace ddprof
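The jump from 500000 to 2^19 lines up with the new constants elsewhere in this commit. Assuming the bitset is sized as kMaxTracked multiplied by k_ratio_max_elt_to_bitset_size (an inference from the constants, not something stated in the diff), the numbers close exactly:

// Inferred sizing relation; the multiplication below is the only claim made.
static_assert(524288u * 16u == 8u * 1024u * 1024u,
              "2^19 tracked allocations * ratio 16 == 8M-bit default bitset "
              "(1 MiB of storage, ~1/16 occupancy when full)");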
21 changes: 17 additions & 4 deletions include/live_allocation.hpp
@@ -65,8 +65,10 @@ class LiveAllocation {
void register_deallocation(uintptr_t addr, int watcher_pos, pid_t pid) {
PidMap &pid_map = access_resize(_watcher_vector, watcher_pos);
PidStacks &pid_stacks = pid_map[pid];
register_deallocation(addr, pid_stacks._unique_stacks,
pid_stacks._address_map);
if (!register_deallocation(addr, pid_stacks._unique_stacks,
pid_stacks._address_map)) {
++_stats._unmatched_deallocations;
}
}

void clear_pid_for_watcher(int watcher_pos, pid_t pid) {
@@ -80,13 +82,24 @@
}
}

unsigned get_nb_unmatched_deallocations() const {
return _stats._unmatched_deallocations;
}

void cycle() { _stats = {}; }

private:
static void register_deallocation(uintptr_t address, PprofStacks &stacks,
// returns true if the deallocation was registered
static bool register_deallocation(uintptr_t address, PprofStacks &stacks,
AddressMap &address_map);

static void register_allocation(const UnwindOutput &uo, uintptr_t address,
// returns true if the allocation was registered
static bool register_allocation(const UnwindOutput &uo, uintptr_t address,
int64_t value, PprofStacks &stacks,
AddressMap &address_map);
struct {
unsigned _unmatched_deallocations = {};
} _stats;
};

} // namespace ddprof
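The register_deallocation change turns a matching miss into a counted event; its definition lives in a .cc file that is not part of this excerpt, so the following is a self-contained toy of the same look-up-then-erase pattern, not the committed implementation.

#include <cstdint>
#include <cstdio>
#include <unordered_map>

int main() {
  std::unordered_map<uintptr_t, int64_t> address_map; // addr -> sampled bytes
  unsigned unmatched_deallocations = 0;

  address_map[0x1000] = 4096; // a previously registered allocation

  auto register_deallocation = [&](uintptr_t addr) {
    auto it = address_map.find(addr);
    if (it == address_map.end()) {
      ++unmatched_deallocations; // feeds unmatched_deallocation.count
      return false;
    }
    address_map.erase(it);
    return true;
  };

  register_deallocation(0x1000); // matched
  register_deallocation(0x2000); // unmatched, counted
  printf("unmatched=%u\n", unmatched_deallocations);
  return 0;
}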
18 changes: 11 additions & 7 deletions src/ddprof_worker.cc
@@ -103,8 +103,10 @@ DDRes symbols_update_stats(const SymbolHdr &symbol_hdr) {
}

/// Retrieve cpu / memory info
DDRes worker_update_stats(ProcStatus *procstat, const UnwindState &us,
DDRes worker_update_stats(DDProfWorkerContext &worker_context,
std::chrono::nanoseconds cycle_duration) {
ProcStatus *procstat = &worker_context.proc_status;
const UnwindState &us = *worker_context.us;
const DsoHdr &dso_hdr = us.dso_hdr;
// Update the procstats, but first snapshot the utime so we can compute the
// diff for the utime metric
@@ -122,7 +124,9 @@ DDRes worker_update_stats(ProcStatus *procstat, const UnwindState &us,
ddprof_stats_set(STATS_DSO_NEW_DSO,
dso_hdr._stats.sum_event_metric(DsoStats::kNewDso));
ddprof_stats_set(STATS_DSO_SIZE, dso_hdr.get_nb_dso());

ddprof_stats_set(
STATS_UNMATCHED_DEALLOCATION_COUNT,
worker_context.live_allocation.get_nb_unmatched_deallocations());
// Symbol stats
DDRES_CHECK_FWD(symbols_update_stats(us.symbol_hdr));

@@ -134,7 +138,6 @@ DDRes worker_update_stats(ProcStatus *procstat, const UnwindState &us,

long nsamples = 0;
ddprof_stats_get(STATS_SAMPLE_COUNT, &nsamples);

long tsc_cycles;
ddprof_stats_get(STATS_UNWIND_AVG_TIME, &tsc_cycles);
int64_t const avg_unwind_ns =
@@ -391,7 +394,9 @@ DDRes ddprof_pr_sample(DDProfContext &ctx, perf_event_sample *sample,
// Aggregate if unwinding went well (todo : fatal error propagation)
if (!IsDDResFatal(res)) {
struct UnwindState *us = ctx.worker_ctx.us;
if (Any(EventAggregationMode::kLiveSum & watcher->aggregation_mode)) {
if (Any(EventAggregationMode::kLiveSum & watcher->aggregation_mode) &&
sample->addr) {
// a null address means we should not account for it
ctx.worker_ctx.live_allocation.register_allocation(
us->output, sample->addr, sample->period, watcher_pos, sample->pid);
}
@@ -499,8 +504,7 @@ DDRes ddprof_worker_cycle(DDProfContext &ctx,
ctx.worker_ctx.cycle_start_time = cycle_now;

// Scrape procfs for process usage statistics
DDRES_CHECK_FWD(worker_update_stats(&ctx.worker_ctx.proc_status,
*ctx.worker_ctx.us, cycle_duration));
DDRES_CHECK_FWD(worker_update_stats(ctx.worker_ctx, cycle_duration));

// And emit diagnostic output (if it's enabled)
print_diagnostics(ctx.worker_ctx.us->dso_hdr);
@@ -527,7 +531,7 @@ DDRes ddprof_worker_cycle(DDProfContext &ctx,
export_time_set(ctx);
}
unwind_cycle(ctx.worker_ctx.us);

ctx.worker_ctx.live_allocation.cycle();
// Reset stats relevant to a single cycle
ddprof_reset_worker_stats();

120 changes: 120 additions & 0 deletions src/lib/address_bitset.cc
@@ -0,0 +1,120 @@
// Unless explicitly stated otherwise all files in this repository are licensed
// under the Apache License Version 2.0. This product includes software
// developed at Datadog (https://www.datadoghq.com/). Copyright 2021-Present
// Datadog, Inc.
#include "address_bitset.hpp"

#include <algorithm>
#include <bit>
#include <functional>
#include <unlikely.hpp>

namespace ddprof {

namespace {
unsigned round_up_to_power_of_two(unsigned num) {
if (num == 0) {
return num;
}
// If num is already a power of two
if ((num & (num - 1)) == 0) {
return num;
}
// not a power of two
unsigned count = 0;
while (num) {
num >>= 1;
count++;
}
return 1 << count;
}
} // namespace

AddressBitset::AddressBitset(AddressBitset &&other) noexcept {
move_from(other);
}

AddressBitset &AddressBitset::operator=(AddressBitset &&other) noexcept {
if (this != &other) {
move_from(other);
}
return *this;
}

void AddressBitset::move_from(AddressBitset &other) noexcept {
_lower_bits_ignored = other._lower_bits_ignored;
_bitset_size = other._bitset_size;
_k_nb_words = other._k_nb_words;
_nb_bits_mask = other._nb_bits_mask;
_address_bitset = std::move(other._address_bitset);
_nb_addresses.store(other._nb_addresses.load());

// Reset the state of 'other'
other._bitset_size = 0;
other._k_nb_words = 0;
other._nb_bits_mask = 0;
other._nb_addresses = 0;
}

void AddressBitset::init(unsigned bitset_size) {
// Due to memory alignment, on 64 bits we can assume that the first 4
// bits can be ignored
_lower_bits_ignored = _k_max_bits_ignored;
if (_address_bitset) {
_address_bitset.reset();
}
_bitset_size = round_up_to_power_of_two(bitset_size);
_k_nb_words = (_bitset_size) / (_nb_bits_per_word);
if (_bitset_size) {
_nb_bits_mask = _bitset_size - 1;
_address_bitset = std::make_unique<std::atomic<uint64_t>[]>(_k_nb_words);
}
}

bool AddressBitset::add(uintptr_t addr) {
const uint32_t significant_bits = hash_significant_bits(addr);
// As per nsavoire's comment, it is better to use separate operators
// than to use the div instruction which generates an extra function call
// Also, the usage of a power of two value allows for bit operations
const unsigned index_array = significant_bits / _nb_bits_per_word;
const unsigned bit_offset = significant_bits % _nb_bits_per_word;
const Word_t bit_in_element = (1UL << bit_offset);
// there is a possible race between checking the value
// and setting it
if (!(_address_bitset[index_array].fetch_or(bit_in_element) &
bit_in_element)) {
// check that the element was not already set
++_nb_addresses;
return true;
}
// Collision, element was already set
return false;
}

bool AddressBitset::remove(uintptr_t addr) {
const int significant_bits = hash_significant_bits(addr);
const unsigned index_array = significant_bits / _nb_bits_per_word;
const unsigned bit_offset = significant_bits % _nb_bits_per_word;
const Word_t bit_in_element = (1UL << bit_offset);
if (_address_bitset[index_array].fetch_and(~bit_in_element) &
bit_in_element) {
_nb_addresses.fetch_sub(1, std::memory_order_relaxed);
// in the unlikely event of a clear right at the wrong time, we could
// have a negative number of elements (though count desyncs are acceptable)
return true;
}
return false;
}

void AddressBitset::clear() {
for (unsigned i = 0; i < _k_nb_words; ++i) {
const Word_t original_value = _address_bitset[i].exchange(0);
// Count number of set bits in original_value
const int num_set_bits = std::popcount(original_value);
if (num_set_bits > 0) {
_nb_addresses.fetch_sub(num_set_bits, std::memory_order_relaxed);
}
}
}

} // namespace ddprof
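A small usage sketch of the new class, assuming it is compiled and linked as part of this commit; the include path is a guess based on the file layout above.

#include "lib/address_bitset.hpp" // assumed include path
#include <cstdint>
#include <cstdio>

int main() {
  ddprof::AddressBitset tracked{ddprof::AddressBitset::_k_default_bitset_size};

  const uintptr_t addr = 0x7f00deadbef0;
  const bool inserted = tracked.add(addr);   // true unless a hash collision
  const bool removed = tracked.remove(addr); // true if the bit was set
  printf("inserted=%d removed=%d count=%d\n", inserted, removed, tracked.count());

  tracked.clear(); // zeroes every word and the address counter
  return 0;
}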