Skip to content

Commit

Permalink
Deallocation codepath performance - minor adjustments
Browse files Browse the repository at this point in the history
- Enforce power of two on size of bitset
- Change the log_once api to take into account the place where it is called from
- Naming updates
  • Loading branch information
r1viollet committed Sep 13, 2023
1 parent 0ed09fb commit 29f932e
Show file tree
Hide file tree
Showing 9 changed files with 120 additions and 52 deletions.
16 changes: 9 additions & 7 deletions include/lib/address_bitset.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,22 +20,22 @@ class AddressBitset {
// Note: the hashing step might be bad for cache locality.
public:
// Publish 1 Meg as default
constexpr static unsigned _k_default_max_addresses = 8 * 1024 * 1024;
constexpr static unsigned _k_default_bitset_size = 8 * 1024 * 1024;
AddressBitset(unsigned max_addresses = 0) { init(max_addresses); }
void init(unsigned max_addresses);
// returns true if the element was inserted
bool set(uintptr_t addr);
bool add(uintptr_t addr);
// returns true if the element was removed
bool unset(uintptr_t addr);
bool remove(uintptr_t addr);
void clear();
int nb_addresses() const { return _nb_addresses; }
int count() const { return _nb_addresses; }

private:
static constexpr unsigned _k_max_bits_ignored = 4;
unsigned _lower_bits_ignored;
// element type
using Elt_t = uint64_t;
constexpr static unsigned _nb_bits_per_elt = sizeof(Elt_t) * 8;
using Word_t = uint64_t;
constexpr static unsigned _nb_bits_per_elt = sizeof(Word_t) * 8;
// 1 Meg divided in uint64's size
// The probability of collision is proportional to the number of elements
// already within the bitset
Expand All @@ -46,11 +46,13 @@ class AddressBitset {
std::unique_ptr<std::atomic<uint64_t>[]> _address_bitset;
std::atomic<int> _nb_addresses = 0;

static unsigned int count_set_bits(Word_t w);

// This is a kind of hash function
// We remove the lower bits (as the alignment constraints makes them useless)
// We fold the address
// Then we only keep the bits that matter for the order in the bitmap
uint32_t remove_lower_bits(uintptr_t h1) {
uint32_t hash_significant_bits(uintptr_t h1) {
uint64_t intermediate = h1 >> _lower_bits_ignored;
uint32_t high = (uint32_t)(intermediate >> 32);
uint32_t low = (uint32_t)intermediate;
Expand Down
21 changes: 16 additions & 5 deletions include/lib/lib_logger.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,24 @@
#include <mutex>

namespace ddprof {
// Invokes `func` at most once per `flag` in release builds (NDEBUG), so a
// given log statement cannot flood stderr in production.  In debug builds
// the callback runs on every call, keeping repeated messages visible while
// developing.  `func` must be invocable with no arguments.
#ifdef NDEBUG
template <typename Func>
void log_once_helper(std::once_flag &flag, Func &&func) {
  // std::call_once is thread-safe: concurrent callers race benignly and the
  // callback fires exactly once for this flag.
  std::call_once(flag, std::forward<Func>(func));
}
#else
template <typename Func>
void log_once_helper(std::once_flag & /*flag*/, Func &&func) {
  // Debug builds: always invoke, the flag is deliberately unused.
  std::forward<Func>(func)();
}
#endif

// Create a distinct once-flag per call site so each LOG_ONCE statement logs
// at most once (in release builds).
//
// Two-level concatenation is required: operands of ## are NOT macro-expanded,
// so `FLAG_##__LINE__##__FILE__` would paste the literal tokens `__LINE__`
// and `__FILE__` (and pasting the __FILE__ string literal into an identifier
// is ill-formed anyway).  __LINE__ alone is sufficient here because the
// `static` local already lives in a unique block scope per expansion.
#define DDPROF_LOG_ONCE_CONCAT2(a, b) a##b
#define DDPROF_LOG_ONCE_CONCAT(a, b) DDPROF_LOG_ONCE_CONCAT2(a, b)
#define LOG_ONCE(format, ...)                                                  \
  do {                                                                         \
    static std::once_flag DDPROF_LOG_ONCE_CONCAT(unique_once_flag_,            \
                                                 __LINE__);                    \
    ddprof::log_once_helper(                                                   \
        DDPROF_LOG_ONCE_CONCAT(unique_once_flag_, __LINE__),                   \
        [&]() { fprintf(stderr, (format), ##__VA_ARGS__); });                  \
  } while (0)

} // namespace ddprof
66 changes: 49 additions & 17 deletions src/lib/address_bitset.cc
Original file line number Diff line number Diff line change
Expand Up @@ -10,29 +10,48 @@

namespace ddprof {

namespace {
// Round `num` up to the nearest power of two.
// Returns 0 for 0 (the caller treats 0 as "tracking disabled").
// Inputs above 2^31 cannot be rounded up within `unsigned`; we saturate at
// 2^31 instead of executing `1 << 32`, which is undefined behavior.
unsigned round_to_power_of_two(unsigned num) {
  if (num == 0) {
    return 0;
  }
  // Power of two already: a power of two has exactly one bit set, so
  // clearing the lowest set bit (num & (num - 1)) yields zero.
  if ((num & (num - 1)) == 0) {
    return num;
  }
  // Find the position just above the highest set bit.
  unsigned count = 0;
  while (num) {
    num >>= 1;
    ++count;
  }
  // count == 32 would shift out of range (UB); saturate at the largest
  // representable power of two.
  if (count >= 32) {
    return 1u << 31;
  }
  // Unsigned literal: `1 << 31` on a signed int is implementation-murky.
  return 1u << count;
}
} // namespace

// (Re)initialize the bitset to hold up to `max_addresses` hashed slots.
// A value of 0 leaves the bitset unallocated (tracking disabled).
void AddressBitset::init(unsigned max_addresses) {
  // Due to memory alignment, on 64 bits we can assume that the first 4
  // bits can be ignored
  _lower_bits_ignored = _k_max_bits_ignored;
  // Release any previous allocation so init() can be called repeatedly.
  if (_address_bitset) {
    _address_bitset.reset();
  }
  // Enforce a power-of-two bit count so the index mask (_nb_bits - 1)
  // can replace a modulo on the hot path.
  _nb_bits = round_to_power_of_two(max_addresses);
  _k_nb_elements = _nb_bits / _nb_bits_per_elt;
  if (_nb_bits) {
    _nb_bits_mask = _nb_bits - 1;
    // make_unique value-initializes the array: every atomic word starts at 0.
    _address_bitset = std::make_unique<std::atomic<uint64_t>[]>(_k_nb_elements);
  }
}

bool AddressBitset::set(uintptr_t addr) {
uint32_t significant_bits = remove_lower_bits(addr);
bool AddressBitset::add(uintptr_t addr) {
uint32_t significant_bits = hash_significant_bits(addr);
// As per nsavoire's comment, it is better to use separate operators
// than to use the div instruction which generates an extra function call
// Also, the usage of a power of two value allows for bit operations
unsigned index_array = significant_bits / 64;
unsigned bit_offset = significant_bits % 64;
uint64_t bit_in_element = (1UL << bit_offset);
unsigned index_array = significant_bits / sizeof(Word_t);
unsigned bit_offset = significant_bits % sizeof(Word_t);
Word_t bit_in_element = (1UL << bit_offset);
// there is a possible race between checking the value
// and setting it
if (!(_address_bitset[index_array].fetch_or(bit_in_element) &
Expand All @@ -45,26 +64,39 @@ bool AddressBitset::set(uintptr_t addr) {
return false;
}

bool AddressBitset::unset(uintptr_t addr) {
uint64_t hash_addr = remove_lower_bits(addr);
int significant_bits = hash_addr & _nb_bits_mask;
unsigned index_array = significant_bits / 64;
unsigned bit_offset = significant_bits % 64;
uint64_t bit_in_element = (1UL << bit_offset);
bool AddressBitset::remove(uintptr_t addr) {
int significant_bits = hash_significant_bits(addr);
unsigned index_array = significant_bits / sizeof(Word_t);
unsigned bit_offset = significant_bits % sizeof(Word_t);
Word_t bit_in_element = (1UL << bit_offset);
if ((_address_bitset[index_array].fetch_xor(bit_in_element) &
bit_in_element) &&
likely(_nb_addresses.load(std::memory_order_relaxed) >= 0)) {
--_nb_addresses; // fetch_add - 1
bit_in_element)) {
_nb_addresses.fetch_sub(1, std::memory_order_relaxed);
// in the unlikely event of a clear right at the wrong time, we could
// have a negative number of elements (though count desyncs are acceptable)
return true;
}
return false;
}

// Population count of `w` (number of set bits).
// Uses Brian Kernighan's method: `w &= w - 1` clears the lowest set bit,
// so the loop runs once per SET bit instead of once per bit (64 iterations
// worst case either way, but typically far fewer for sparse words).
unsigned int AddressBitset::count_set_bits(Word_t w) {
  unsigned int count = 0;
  while (w) {
    w &= (w - 1);
    ++count;
  }
  return count;
}

// Reset every bit and adjust the element counter by the number of bits
// actually cleared.  We exchange each word (rather than blindly storing 0
// and then zeroing the counter) so that add()/remove() calls racing with
// this loop keep the counter roughly consistent with the bits they touch;
// count desyncs remain possible but bounded.
void AddressBitset::clear() {
  for (unsigned i = 0; i < _k_nb_elements; ++i) {
    Word_t original_value = _address_bitset[i].exchange(0);
    // Count number of set bits in original_value
    int num_set_bits = count_set_bits(original_value);
    if (num_set_bits > 0) {
      _nb_addresses.fetch_sub(num_set_bits, std::memory_order_relaxed);
    }
  }
}

} // namespace ddprof
20 changes: 10 additions & 10 deletions src/lib/allocation_tracker.cc
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ TrackerThreadLocalState *AllocationTracker::init_tl_state() {

if (res_set) {
// should return 0
log_once("Error: Unable to store tl_state. error %d \n", res_set);
LOG_ONCE("Error: Unable to store tl_state. error %d \n", res_set);
delete tl_state;
tl_state = nullptr;
}
Expand Down Expand Up @@ -169,7 +169,7 @@ void AllocationTracker::allocation_tracking_free() {
}
TrackerThreadLocalState *tl_state = get_tl_state();
if (unlikely(!tl_state)) {
log_once("Error: Unable to find tl_state during %s\n", __FUNCTION__);
LOG_ONCE("Error: Unable to find tl_state during %s\n", __FUNCTION__);
instance->free();
return;
}
Expand Down Expand Up @@ -234,18 +234,18 @@ void AllocationTracker::track_allocation(uintptr_t addr, size_t size,
uint64_t total_size = nsamples * sampling_interval;

if (_state.track_deallocations) {
if (_allocated_address_set.set(addr)) {
if (unlikely(_allocated_address_set.nb_addresses() >
if (_allocated_address_set.add(addr)) {
if (unlikely(_allocated_address_set.count() >
ddprof::liveallocation::kMaxTracked)) {
// Check if we reached max number of elements
// Clear elements if we reach too many
// todo: should we just stop new live tracking (like java) ?
if (IsDDResOK(push_clear_live_allocation(tl_state))) {
_allocated_address_set.clear();
// still set this as we are pushing the allocation to ddprof
_allocated_address_set.set(addr);
_allocated_address_set.add(addr);
} else {
log_once(
LOG_ONCE(
"Error: %s",
"Stop allocation profiling. Unable to clear live allocation \n");
free();
Expand All @@ -259,7 +259,7 @@ void AllocationTracker::track_allocation(uintptr_t addr, size_t size,
bool success = IsDDResOK(push_alloc_sample(addr, total_size, tl_state));
free_on_consecutive_failures(success);
if (unlikely(!success) && _state.track_deallocations) {
_allocated_address_set.unset(addr);
_allocated_address_set.remove(addr);
}
}

Expand All @@ -273,7 +273,7 @@ void AllocationTracker::track_deallocation(uintptr_t addr,
return;
}

if (!_state.track_deallocations || !_allocated_address_set.unset(addr)) {
if (!_state.track_deallocations || !_allocated_address_set.remove(addr)) {
return;
}

Expand Down Expand Up @@ -502,7 +502,7 @@ void AllocationTracker::notify_thread_start() {
if (unlikely(!tl_state)) {
tl_state = init_tl_state();
if (!tl_state) {
log_once("Error: Unable to start allocation profiling on thread %d",
LOG_ONCE("Error: Unable to start allocation profiling on thread %d",
ddprof::gettid());
return;
}
Expand All @@ -522,7 +522,7 @@ void AllocationTracker::notify_fork() {
if (unlikely(!tl_state)) {
// The state should already exist if we forked.
// This would mean that we were not able to create the state before forking
log_once("Error: Unable to retrieve tl state after fork thread %d",
LOG_ONCE("Error: Unable to retrieve tl state after fork thread %d",
ddprof::gettid());
return;
} else {
Expand Down
5 changes: 2 additions & 3 deletions src/lib/dd_profiling.cc
Original file line number Diff line number Diff line change
Expand Up @@ -272,16 +272,15 @@ int ddprof_start_profiling_internal() {
// fails ?
g_state.allocation_profiling_started = true;
} else {
ddprof::log_once("Error: %s",
"Failure to start allocation profiling\n");
LOG_ONCE("Error: %s", "Failure to start allocation profiling\n");
}
}
} catch (const ddprof::DDException &e) { return -1; }

if (g_state.allocation_profiling_started) {
int res = pthread_atfork(nullptr, nullptr, notify_fork);
if (res) {
ddprof::log_once("Error:%s", "Unable to setup notify fork");
LOG_ONCE("Error:%s", "Unable to setup notify fork");
assert(0);
}
}
Expand Down
2 changes: 2 additions & 0 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,8 @@ add_unit_test(pthread_tls-ut pthread_tls-ut.cc)

add_unit_test(address_bitset-ut address_bitset-ut.cc ../src/lib/address_bitset.cc)

add_unit_test(lib_logger-ut ./lib_logger-ut.cc)

add_benchmark(savecontext-bench savecontext-bench.cc ../src/lib/pthread_fixes.cc
../src/lib/savecontext.cc ../src/lib/saveregisters.cc)

Expand Down
16 changes: 8 additions & 8 deletions test/address_bitset-ut.cc
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,14 @@
namespace ddprof {

// Basic add/remove contract: first insertion succeeds, duplicate insertion
// is rejected, and removal of a present element succeeds.
TEST(address_bitset, simple) {
  AddressBitset address_bitset(AddressBitset::_k_default_bitset_size);
  EXPECT_TRUE(address_bitset.add(0xbadbeef));
  EXPECT_FALSE(address_bitset.add(0xbadbeef));
  EXPECT_TRUE(address_bitset.remove(0xbadbeef));
}

TEST(address_bitset, many_addresses) {
AddressBitset address_bitset(AddressBitset::_k_default_max_addresses);
AddressBitset address_bitset(AddressBitset::_k_default_bitset_size);
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<uintptr_t> dis(
Expand All @@ -31,15 +31,15 @@ TEST(address_bitset, many_addresses) {
unsigned nb_elements = 100000;
for (unsigned i = 0; i < nb_elements; ++i) {
uintptr_t addr = dis(gen);
if (address_bitset.set(addr)) {
if (address_bitset.add(addr)) {
addresses.push_back(addr);
}
}
EXPECT_TRUE(nb_elements - (nb_elements / 10) < addresses.size());
for (auto addr : addresses) {
EXPECT_TRUE(address_bitset.unset(addr));
EXPECT_TRUE(address_bitset.remove(addr));
}
EXPECT_EQ(0, address_bitset.nb_addresses());
EXPECT_EQ(0, address_bitset.count());
}

// This test to tune the hash approach
Expand Down
22 changes: 22 additions & 0 deletions test/lib_logger-ut.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
// Unless explicitly stated otherwise all files in this repository are licensed
// under the Apache License Version 2.0. This product includes software
// developed at Datadog (https://www.datadoghq.com/). Copyright 2021-Present
// Datadog, Inc.

#include <gtest/gtest.h>

#include "lib/lib_logger.hpp"

namespace ddprof {

// Wraps a single LOG_ONCE call site: every invocation of foo_log shares the
// same static once-flag, so only the first string passed is printed
// (in builds where LOG_ONCE deduplicates).
void foo_log(const std::string &str) { LOG_ONCE("%s\n", str.c_str()); }

// Smoke test for LOG_ONCE: exercises several distinct call sites plus a
// repeated call through the same site (foo_log).  Output is visual only —
// the test passes as long as nothing crashes.
TEST(LibLogger, simple) {
  // Each LOG_ONCE expansion below is its own call site, so each prints once.
  LOG_ONCE("something\n");
  LOG_ONCE("else\n");
  foo_log("one"); // this shows
  foo_log("two"); // this does not show (same call site/flag as "one")
  const char *some_string = "some_string";
  LOG_ONCE("Some string = %s\n", some_string);
}
} // namespace ddprof
4 changes: 2 additions & 2 deletions test/reentry_guard-ut.cc
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,8 @@ TEST(ReentryGuardTest, null_init) {
guard.register_guard(&reentry_guard);
EXPECT_TRUE(guard);
{
ddprof::ReentryGuard guard(&reentry_guard);
EXPECT_FALSE(guard);
ddprof::ReentryGuard guard2(&reentry_guard);
EXPECT_FALSE(guard2);
}
}
}
Expand Down

0 comments on commit 29f932e

Please sign in to comment.