Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Performance of deallocation code paths - tree bitset #307

Draft
wants to merge 16 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -198,6 +198,7 @@ set(DD_PROFILING_SOURCES
src/ddprof_cmdline.cc
src/ddres_list.cc
src/ipc.cc
src/lib/address_bitset.cc
src/lib/allocation_tracker.cc
src/lib/dd_profiling.cc
src/lib/elfutils.cc
Expand Down
2 changes: 2 additions & 0 deletions include/ddprof_perf_event.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@
#include <linux/perf_event.h>
#include <type_traits>

#include "perf.hpp"

// Extend the perf event types
// There are <30 different perf events (starting at 1000 seems safe)
enum : uint32_t {
Expand Down
48 changes: 48 additions & 0 deletions include/lib/address_bitset.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
// Unless explicitly stated otherwise all files in this repository are licensed
// under the Apache License Version 2.0. This product includes software
// developed at Datadog (https://www.datadoghq.com/). Copyright 2021-Present
// Datadog, Inc.
#pragma once

#include <atomic>
#include <memory>
#include <stdint.h>
#include <string.h>

namespace ddprof {
/// Set of addresses stored as a three-level tree of atomic bit words.
///
/// An address is split into a top-level index (bits 32-47), a mid-level index
/// (bits 16-31) and a bit position inside a leaf (bits 4-15, the lowest
/// `_lower_bits_ignored` bits are dropped). Bits above 47 are not used, so
/// addresses differing only there map to the same bit.
/// Mid and leaf nodes are heap-allocated by add() and released by the
/// destructor.
class AddressBitset {
public:
  explicit AddressBitset() {}
  ~AddressBitset();
  // The tree nodes are owned through raw pointers, so copying is forbidden.
  AddressBitset(const AddressBitset &other) = delete;
  AddressBitset &operator=(const AddressBitset &other) = delete;

  // returns true if the element was inserted
  bool add(uintptr_t addr);
  // returns true if the element was removed
  bool remove(uintptr_t addr);
  // resets every bit (nodes stay allocated)
  void clear();
  // number of bits currently set
  int count() const { return _nb_addresses; }

private:
  // Low address bits dropped before indexing into a leaf
  static constexpr unsigned _lower_bits_ignored = 4;
  // Upper bound on the number of mid levels add() is willing to allocate
  static constexpr int _k_default_max_mid_levels = 10;
  // element type
  using Word_t = uint64_t;
  constexpr static unsigned _nb_bits_per_word = sizeof(Word_t) * 8;
  static constexpr unsigned _k_nb_words = 4096 / 64;       // (64)
  static constexpr unsigned _nb_entries_per_level = 65536; // 2^16

  // add() / remove() index each level with 16 bits of the address: make sure
  // the table sizes stay in sync with that split.
  static_assert(_nb_entries_per_level == (1U << 16),
                "top and mid levels are indexed with 16 address bits");
  static_assert(_k_nb_words * _nb_bits_per_word ==
                    ((1U << 16) >> _lower_bits_ignored),
                "a leaf must hold one bit per tracked slot of a 16-bit range");

  struct LeafLevel {
    std::atomic<Word_t> leaf[_k_nb_words] = {};
  };

  struct MidLevel {
    std::atomic<LeafLevel *> mid[_nb_entries_per_level] = {};
  };

  std::atomic<MidLevel *> _top_level[_nb_entries_per_level] = {};
  std::atomic<int> _nb_addresses{0};
  std::atomic<int> _nb_mid_levels{0};
};
} // namespace ddprof
7 changes: 5 additions & 2 deletions include/lib/allocation_tracker.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

#pragma once

#include "address_bitset.hpp"
#include "allocation_tracker_tls.hpp"
#include "ddprof_base.hpp"
#include "ddres_def.hpp"
Expand Down Expand Up @@ -83,7 +84,8 @@ class AllocationTracker {
uint64_t next_sample_interval(std::minstd_rand &gen);

DDRes init(uint64_t mem_profile_interval, bool deterministic_sampling,
uint32_t stack_sample_size, const RingBufferInfo &ring_buffer);
bool track_deallocations, uint32_t stack_sample_size,
const RingBufferInfo &ring_buffer);
void free();

static AllocationTracker *create_instance();
Expand Down Expand Up @@ -113,7 +115,8 @@ class AllocationTracker {
uint32_t _stack_sample_size;
PEvent _pevent;
bool _deterministic_sampling;
AdressSet _address_set;

std::unique_ptr<AddressBitset> _allocated_address_set;

// These can not be tied to the internal state of the instance.
// The creation of the instance depends on this
Expand Down
36 changes: 30 additions & 6 deletions include/lib/lib_logger.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,37 @@
#include <mutex>

namespace ddprof {
template <typename... Args>
void log_once(char const *const format, Args... args) {
#ifndef DEBUG
static std::once_flag flag;
std::call_once(flag, [&, format]() { fprintf(stderr, format, args...); });

#ifdef NDEBUG
template <typename Func>
void log_once_helper(std::once_flag &flag, Func &&func) {
std::call_once(flag, std::forward<Func>(func));
#else
fprintf(stderr, format, args...);
template <typename Func> void log_once_helper(std::once_flag &, Func &&func) {
func();
#endif
}

// Runs `func` through std::call_once on `flag`, whatever the build
// configuration: the callable is executed by the first call made with a given
// flag, later calls with the same flag do nothing.
// `func` is perfectly forwarded to std::call_once.
template <typename Func>
void log_always_once_helper(std::once_flag &flag, Func &&func) {
std::call_once(flag, std::forward<Func>(func));
}

// Prints a printf-style message on stderr, guarded by a once-flag that is
// private to this expansion (a static local of the do-block), in every build
// configuration.
// The flag uses a plain identifier: pasting `__COUNTER__` with ## does not
// expand it (operands of ## are not macro-expanded), and the block scope is
// already enough to keep flags of different expansions apart.
#define LOG_ALWAYS_ONCE(format, ...)                                           \
  do {                                                                         \
    static std::once_flag ddprof_log_once_flag;                                \
    ddprof::log_always_once_helper(ddprof_log_once_flag, [&]() {               \
      fprintf(stderr, (format), ##__VA_ARGS__);                                \
    });                                                                        \
  } while (0)

// Prints a printf-style message on stderr through ddprof::log_once_helper,
// with a once-flag that is private to this expansion (a static local of the
// do-block). Whether the message is really limited to a single print is
// decided by log_once_helper.
// The flag uses a plain identifier: pasting `__COUNTER__` with ## does not
// expand it (operands of ## are not macro-expanded), and the block scope is
// already enough to keep flags of different expansions apart.
#define LOG_ONCE(format, ...)                                                  \
  do {                                                                         \
    static std::once_flag ddprof_log_once_flag;                                \
    ddprof::log_once_helper(ddprof_log_once_flag, [&]() {                      \
      fprintf(stderr, (format), ##__VA_ARGS__);                                \
    });                                                                        \
  } while (0)

} // namespace ddprof
2 changes: 1 addition & 1 deletion include/live_allocation-c.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ namespace liveallocation {
// build time override to reduce execution time of test
static constexpr auto kMaxTracked = KMAX_TRACKED_ALLOCATIONS;
#else
static constexpr auto kMaxTracked = 500000;
static constexpr auto kMaxTracked = 524288; // 2^19
#endif
} // namespace liveallocation
} // namespace ddprof
3 changes: 2 additions & 1 deletion src/ddprof_worker.cc
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,8 @@ DDRes ddprof_pr_sample(DDProfContext &ctx, perf_event_sample *sample,
// Aggregate if unwinding went well (todo : fatal error propagation)
if (!IsDDResFatal(res)) {
struct UnwindState *us = ctx.worker_ctx.us;
if (Any(EventConfMode::kLiveCallgraph & watcher->output_mode)) {
if (Any(EventConfMode::kLiveCallgraph & watcher->output_mode) &&
sample->addr) { // null address means we should not account it
// Live callgraph mode
// for now we hard code the live aggregation mode
ctx.worker_ctx.live_allocation.register_allocation(
Expand Down
170 changes: 170 additions & 0 deletions src/lib/address_bitset.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
// Unless explicitly stated otherwise all files in this repository are licensed
// under the Apache License Version 2.0. This product includes software
// developed at Datadog (https://www.datadoghq.com/). Copyright 2021-Present
// Datadog, Inc.
#include "address_bitset.hpp"

#include <algorithm>
#include <bit>
#include <cassert>
#include <functional>
#include <lib_logger.hpp>
#include <unlikely.hpp>

namespace ddprof {

bool AddressBitset::add(uintptr_t addr) {
// Extract top 16 bits for top-level index
unsigned top_index = (addr >> 32) & 0xFFFF;

// If the entry at this index is null, allocate and try to atomically set it
MidLevel *expected_mid =
_top_level[top_index].load(std::memory_order_relaxed);
if (!expected_mid) {
if (_nb_mid_levels >= _k_default_max_mid_levels) {
// Every new level adds half a meg of overhead
LOG_ALWAYS_ONCE(
"<Warn> ddprof: Address bitset reached maximum number of mid levels\n");
return false;
}
expected_mid = new MidLevel();
MidLevel *old_mid = nullptr;
if (!_top_level[top_index].compare_exchange_strong(old_mid, expected_mid)) {
delete expected_mid;
expected_mid = old_mid;
} else {
++_nb_mid_levels;
}
assert(expected_mid);
if (!expected_mid) {
// something went wrong
return false;
}
}

// Extract middle 16 bits for mid-level index
unsigned mid_index = (addr >> 16) & 0xFFFF;

// If the entry at this mid-level index is null, allocate and try to
// atomically set it
LeafLevel *expected_leaf =
expected_mid->mid[mid_index].load(std::memory_order_acquire);
if (!expected_leaf) {
expected_leaf = new LeafLevel();
LeafLevel *old_leaf = nullptr;
if (!expected_mid->mid[mid_index].compare_exchange_strong(old_leaf,
expected_leaf)) {
delete expected_leaf;
expected_leaf = old_leaf;
}
assert(expected_leaf);
if (!expected_leaf) {
// something went wrong
return false;
}
}

// Extract lower 16 bits and ignore lower bits (12 remain)
unsigned leaf_index = (addr & 0xFFFF) >> _lower_bits_ignored;
unsigned index_array = leaf_index / _nb_bits_per_word;
assert(index_array < _k_nb_words);
unsigned bit_offset = leaf_index % _nb_bits_per_word;
Word_t bit_in_element = (1UL << bit_offset);

if (!(expected_leaf->leaf[index_array].fetch_or(bit_in_element) &
bit_in_element)) {
++_nb_addresses;
return true;
}
return false; // Collision
}

bool AddressBitset::remove(uintptr_t addr) {
// Extract top 16 bits for top-level index
unsigned top_index = (addr >> 32) & 0xFFFF;

// Try to get the mid-level pointer. If it's null, return false.
MidLevel *mid = _top_level[top_index].load(std::memory_order_acquire);
if (unlikely(!mid)) {
return false;
}

// Extract middle 16 bits for mid-level index
unsigned mid_index = (addr >> 16) & 0xFFFF;

// Try to get the leaf-level pointer from the mid-level. If it's null, return
// false.
LeafLevel *leaf = mid->mid[mid_index].load(std::memory_order_acquire);
if (unlikely(!leaf)) {
return false;
}

// Extract lower 16 bits and ignore lower bits (12 remain)
unsigned leaf_index = (addr & 0xFFFF) >> _lower_bits_ignored;
unsigned index_array = leaf_index / _nb_bits_per_word;
assert(index_array < _k_nb_words);
unsigned bit_offset = leaf_index % _nb_bits_per_word;
Word_t bit_in_element = (1UL << bit_offset);

// Use fetch_and to zero the bit
if (leaf->leaf[index_array].fetch_and(~bit_in_element) & bit_in_element) {
_nb_addresses.fetch_sub(1, std::memory_order_relaxed);
return true;
}
// Otherwise, the bit wasn't set to begin with, so return false
return false;
}

// TODO: the performance of this clear is horrible
// For now we will avoid calling it
void AddressBitset::clear() {
for (unsigned t = 0; t < _nb_entries_per_level; ++t) {
MidLevel *mid = _top_level[t].load(std::memory_order_acquire);
if (mid) { // if mid-level exists
for (unsigned m = 0; m < _nb_entries_per_level; ++m) {
LeafLevel *leaf = mid->mid[m].load(std::memory_order_acquire);
if (leaf) { // if leaf-level exists
for (unsigned l = 0; l < _k_nb_words; ++l) {
Word_t original_value = leaf->leaf[l].exchange(0);
// Count number of set bits in original_value
int num_set_bits = std::popcount(original_value);
if (num_set_bits > 0) {
_nb_addresses.fetch_sub(num_set_bits, std::memory_order_relaxed);
}
}
}
}
}
}
}

// Releases every leaf, then every mid level, reachable from the top-level
// table. When DEBUG is defined, the number of nodes visited is printed on
// stderr.
AddressBitset::~AddressBitset() {
#ifdef DEBUG
  unsigned nb_mids_seen = 0;
  unsigned nb_leaves_seen = 0;
#endif
  for (auto &top_slot : _top_level) {
    MidLevel *const mid_level = top_slot.load(std::memory_order_acquire);
    if (!mid_level) {
      continue; // nothing allocated under this top-level index
    }
#ifdef DEBUG
    ++nb_mids_seen;
#endif
    for (auto &mid_slot : mid_level->mid) {
      LeafLevel *const leaf_level = mid_slot.load(std::memory_order_acquire);
      if (!leaf_level) {
        continue; // nothing allocated under this mid-level index
      }
#ifdef DEBUG
      ++nb_leaves_seen;
#endif
      delete leaf_level;
    }
    // All leaves of this mid level are gone, the mid level can go too
    delete mid_level;
  }
#ifdef DEBUG
  fprintf(stderr, "Mid count = %u \n", nb_mids_seen);
  fprintf(stderr, "Leaf count = %u \n", nb_leaves_seen);
#endif
}

} // namespace ddprof
Loading