Performance live heap profiling - deallocation code path (#298)
* Performance live heap profiling - deallocation code path
Add a bitset to track the addresses that are kept for heap profiling.
Remove the lock in the deallocation code path.
Add a stat on unmatched deallocations, which could be a source of increased CPU consumption.
r1viollet authored Oct 12, 2023
1 parent d078da2 commit 53fa554
Showing 19 changed files with 530 additions and 116 deletions.
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -199,6 +199,7 @@ set(DD_PROFILING_SOURCES
src/ddprof_cmdline.cc
src/ddres_list.cc
src/ipc.cc
src/lib/address_bitset.cc
src/lib/allocation_tracker.cc
src/lib/dd_profiling.cc
src/lib/elfutils.cc
1 change: 1 addition & 0 deletions include/ddprof_stats.hpp
@@ -14,6 +14,7 @@
X(EVENT_COUNT, "event.count", STAT_GAUGE) \
X(EVENT_LOST, "event.lost", STAT_GAUGE) \
X(SAMPLE_COUNT, "sample.count", STAT_GAUGE) \
X(UNMATCHED_DEALLOCATION_COUNT, "unmatched_deallocation.count", STAT_GAUGE) \
X(TARGET_CPU_USAGE, "target_process.cpu_usage.millicores", STAT_GAUGE) \
X(UNWIND_AVG_TIME, "unwind.avg_time_ns", STAT_GAUGE) \
X(UNWIND_FRAMES, "unwind.frames", STAT_GAUGE) \
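The stats table above follows the X-macro pattern; the new entry is what the worker code later reads back as STATS_UNMATCHED_DEALLOCATION_COUNT. A minimal sketch of how such a table is typically expanded follows — the macro and enum names here are illustrative assumptions, not the actual contents of ddprof_stats.hpp (which also carries the STAT_GAUGE column).

// Illustrative X-macro expansion; STATS_TABLE and StatsId are made-up names.
#define STATS_TABLE(X)                                                    \
  X(SAMPLE_COUNT, "sample.count")                                         \
  X(UNMATCHED_DEALLOCATION_COUNT, "unmatched_deallocation.count")

enum StatsId {
#define X(name, str) STATS_##name,
  STATS_TABLE(X) // yields STATS_SAMPLE_COUNT, STATS_UNMATCHED_DEALLOCATION_COUNT
#undef X
  STATS_LEN,
};

inline const char *k_stat_names[] = {
#define X(name, str) str,
    STATS_TABLE(X) // yields "sample.count", "unmatched_deallocation.count"
#undef X
};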
69 changes: 69 additions & 0 deletions include/lib/address_bitset.hpp
@@ -0,0 +1,69 @@
// Unless explicitly stated otherwise all files in this repository are licensed
// under the Apache License Version 2.0. This product includes software
// developed at Datadog (https://www.datadoghq.com/). Copyright 2021-Present
// Datadog, Inc.
#pragma once

#include <atomic>
#include <memory>
#include <stdint.h>
#include <string.h>

namespace ddprof {
class AddressBitset {
// The number of bits is the number of addresses we can store:
// one address per individual bit, so a 4-bit set (1111) can hold 4 addresses.
// We hash the address to a number (to have an equal probability of using
// all bits), then use the mask to position this address in our bitset.
// Addr -> Hash -> Mask (to get usable bits) -> Position in the bitset
// Note: the hashing step might be bad for cache locality.
public:
// Default: 8 * 1024 * 1024 bits, i.e. 1 MiB of storage
constexpr static unsigned _k_default_bitset_size = 8 * 1024 * 1024;
explicit AddressBitset(unsigned bitset_size = 0) { init(bitset_size); }
AddressBitset(AddressBitset &&other) noexcept;
AddressBitset &operator=(AddressBitset &&other) noexcept;

AddressBitset(AddressBitset &other) = delete;
AddressBitset &operator=(AddressBitset &other) = delete;

// returns true if the element was inserted
bool add(uintptr_t addr);
// returns true if the element was removed
bool remove(uintptr_t addr);
void clear();
int count() const { return _nb_addresses; }

private:
static constexpr unsigned _k_max_bits_ignored = 4;
unsigned _lower_bits_ignored;
// element type
using Word_t = uint64_t;
constexpr static unsigned _nb_bits_per_word = sizeof(Word_t) * 8;
// 1 MiB of bits divided into uint64_t-sized words
// The probability of collision is proportional to the number of elements
// already within the bitset
unsigned _bitset_size = {};
unsigned _k_nb_words = {};
unsigned _nb_bits_mask = {};
// We can not use an actual bitset (for atomicity reasons)
std::unique_ptr<std::atomic<uint64_t>[]> _address_bitset;
std::atomic<int> _nb_addresses = 0;

void init(unsigned bitset_size);

void move_from(AddressBitset &other) noexcept;
// This is a kind of hash function:
// we drop the lower bits (the alignment constraints make them useless),
// fold the address,
// then keep only the bits that matter for the position in the bitmap.
uint32_t hash_significant_bits(uintptr_t h1) {
uint64_t intermediate = h1 >> _lower_bits_ignored;
uint32_t high = (uint32_t)(intermediate >> 32);
uint32_t low = (uint32_t)intermediate;
uint32_t res = high ^ low;
return res & _nb_bits_mask;
}
};
} // namespace ddprof
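To make the Addr -> Hash -> Mask -> Position comment concrete, here is a standalone sketch of the same bit arithmetic using the default constants from this header; the example address is arbitrary and the snippet is illustrative rather than a copy of the implementation.

#include <cstdint>
#include <cstdio>

int main() {
  // Constants mirroring the defaults above: 8M bits, low 4 bits ignored.
  constexpr unsigned k_bitset_size = 8 * 1024 * 1024;
  constexpr unsigned k_lower_bits_ignored = 4;
  constexpr unsigned k_bits_per_word = 64;
  constexpr uint32_t k_mask = k_bitset_size - 1;

  const uintptr_t addr = 0x7f3a12345670; // arbitrary 16-byte-aligned address
  const uint64_t folded = static_cast<uint64_t>(addr) >> k_lower_bits_ignored;
  const uint32_t hash =
      static_cast<uint32_t>(folded >> 32) ^ static_cast<uint32_t>(folded);
  const uint32_t bit_index = hash & k_mask;                 // position in the bitset
  const unsigned word_index = bit_index / k_bits_per_word;  // which uint64 word
  const unsigned bit_offset = bit_index % k_bits_per_word;  // which bit in that word

  printf("bit %u -> word %u, offset %u\n", bit_index, word_index, bit_offset);
  return 0;
}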
9 changes: 6 additions & 3 deletions include/lib/allocation_tracker.hpp
@@ -5,6 +5,7 @@

#pragma once

#include "address_bitset.hpp"
#include "allocation_tracker_tls.hpp"
#include "ddprof_base.hpp"
#include "ddres_def.hpp"
@@ -63,7 +64,7 @@ class AllocationTracker
static TrackerThreadLocalState *get_tl_state();

private:
using AdressSet = std::unordered_set<uintptr_t>;
static constexpr unsigned k_ratio_max_elt_to_bitset_size = 16;

struct TrackerState {
void init(bool track_alloc, bool track_dealloc) {
@@ -86,7 +87,8 @@ class AllocationTracker
uint64_t next_sample_interval(std::minstd_rand &gen) const;

DDRes init(uint64_t mem_profile_interval, bool deterministic_sampling,
uint32_t stack_sample_size, const RingBufferInfo &ring_buffer);
bool track_deallocations, uint32_t stack_sample_size,
const RingBufferInfo &ring_buffer);
void free();

static AllocationTracker *create_instance();
Expand Down Expand Up @@ -116,7 +118,8 @@ class AllocationTracker {
uint32_t _stack_sample_size;
PEvent _pevent;
bool _deterministic_sampling;
AdressSet _address_set;

AddressBitset _allocated_address_set;

// These can not be tied to the internal state of the instance.
// The creation of the instance depends on this
23 changes: 17 additions & 6 deletions include/lib/lib_logger.hpp
@@ -10,13 +10,24 @@
#include <mutex>

namespace ddprof {
template <typename... Args>
void log_once(char const *const format, Args... args) {
#ifndef DEBUG
static std::once_flag flag;
std::call_once(flag, [&, format]() { fprintf(stderr, format, args...); });

#ifdef NDEBUG
template <typename Func>
void log_once_helper(std::once_flag &flag, Func &&func) {
std::call_once(flag, std::forward<Func>(func));
#else
fprintf(stderr, format, args...);
template <typename Func> void log_once_helper(std::once_flag &, Func &&func) {
func();
#endif
}

// create a once flag for the line and file where this is called:
#define LOG_ONCE(format, ...) \
do { \
static std::once_flag UNIQUE_ONCE_FLAG_##__COUNTER__; \
ddprof::log_once_helper(UNIQUE_ONCE_FLAG_##__COUNTER__, [&]() { \
fprintf(stderr, (format), ##__VA_ARGS__); \
}); \
} while (0)

} // namespace ddprof
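A hypothetical call site for the new macro (the function and message below are made up for illustration): each LOG_ONCE expansion owns its own once_flag, so release builds (NDEBUG) print the message at most once per call site, while debug builds print on every call.

#include "lib/lib_logger.hpp" // assumed include path

void report_lost_events(int nb_lost) {
  // Hypothetical usage: warn once per call site in release builds.
  LOG_ONCE("ddprof: lost %d allocation events\n", nb_lost);
}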
2 changes: 1 addition & 1 deletion include/live_allocation-c.hpp
@@ -11,7 +11,7 @@ namespace liveallocation {
// build time override to reduce execution time of test
static constexpr auto kMaxTracked = KMAX_TRACKED_ALLOCATIONS;
#else
static constexpr auto kMaxTracked = 500000;
static constexpr auto kMaxTracked = 524288; // 2^19
#endif
} // namespace liveallocation
} // namespace ddprof
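The jump from 500000 to 2^19 lines up with the new constants elsewhere in this commit. Assuming the bitset is sized as kMaxTracked multiplied by k_ratio_max_elt_to_bitset_size (an inference from the constants, not something stated in the diff), the numbers close exactly:

// Inferred sizing relation; the multiplication below is the only claim made.
static_assert(524288u * 16u == 8u * 1024u * 1024u,
              "2^19 tracked allocations * ratio 16 == 8M-bit default bitset "
              "(1 MiB of storage, ~1/16 occupancy when full)");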
21 changes: 17 additions & 4 deletions include/live_allocation.hpp
@@ -65,8 +65,10 @@ class LiveAllocation {
void register_deallocation(uintptr_t addr, int watcher_pos, pid_t pid) {
PidMap &pid_map = access_resize(_watcher_vector, watcher_pos);
PidStacks &pid_stacks = pid_map[pid];
register_deallocation(addr, pid_stacks._unique_stacks,
pid_stacks._address_map);
if (!register_deallocation(addr, pid_stacks._unique_stacks,
pid_stacks._address_map)) {
++_stats._unmatched_deallocations;
}
}

void clear_pid_for_watcher(int watcher_pos, pid_t pid) {
@@ -80,13 +82,24 @@
}
}

unsigned get_nb_unmatched_deallocations() const {
return _stats._unmatched_deallocations;
}

void cycle() { _stats = {}; }

private:
static void register_deallocation(uintptr_t address, PprofStacks &stacks,
// returns true if the deallocation was registered
static bool register_deallocation(uintptr_t address, PprofStacks &stacks,
AddressMap &address_map);

static void register_allocation(const UnwindOutput &uo, uintptr_t address,
// returns true if the allocation was registered
static bool register_allocation(const UnwindOutput &uo, uintptr_t address,
int64_t value, PprofStacks &stacks,
AddressMap &address_map);
struct {
unsigned _unmatched_deallocations = {};
} _stats;
};

} // namespace ddprof
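The register_deallocation change turns a matching miss into a counted event; its definition lives in a .cc file that is not part of this excerpt, so the following is a self-contained toy of the same look-up-then-erase pattern, not the committed implementation.

#include <cstdint>
#include <cstdio>
#include <unordered_map>

int main() {
  std::unordered_map<uintptr_t, int64_t> address_map; // addr -> sampled bytes
  unsigned unmatched_deallocations = 0;

  address_map[0x1000] = 4096; // a previously registered allocation

  auto register_deallocation = [&](uintptr_t addr) {
    auto it = address_map.find(addr);
    if (it == address_map.end()) {
      ++unmatched_deallocations; // feeds unmatched_deallocation.count
      return false;
    }
    address_map.erase(it);
    return true;
  };

  register_deallocation(0x1000); // matched
  register_deallocation(0x2000); // unmatched, counted
  printf("unmatched=%u\n", unmatched_deallocations);
  return 0;
}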
18 changes: 11 additions & 7 deletions src/ddprof_worker.cc
@@ -103,8 +103,10 @@ DDRes symbols_update_stats(const SymbolHdr &symbol_hdr) {
}

/// Retrieve cpu / memory info
DDRes worker_update_stats(ProcStatus *procstat, const UnwindState &us,
DDRes worker_update_stats(DDProfWorkerContext &worker_context,
std::chrono::nanoseconds cycle_duration) {
ProcStatus *procstat = &worker_context.proc_status;
const UnwindState &us = *worker_context.us;
const DsoHdr &dso_hdr = us.dso_hdr;
// Update the procstats, but first snapshot the utime so we can compute the
// diff for the utime metric
@@ -122,7 +124,9 @@ DDRes worker_update_stats(ProcStatus *procstat, const UnwindState &us,
ddprof_stats_set(STATS_DSO_NEW_DSO,
dso_hdr._stats.sum_event_metric(DsoStats::kNewDso));
ddprof_stats_set(STATS_DSO_SIZE, dso_hdr.get_nb_dso());

ddprof_stats_set(
STATS_UNMATCHED_DEALLOCATION_COUNT,
worker_context.live_allocation.get_nb_unmatched_deallocations());
// Symbol stats
DDRES_CHECK_FWD(symbols_update_stats(us.symbol_hdr));

@@ -134,7 +138,6 @@ DDRes worker_update_stats(ProcStatus *procstat, const UnwindState &us,

long nsamples = 0;
ddprof_stats_get(STATS_SAMPLE_COUNT, &nsamples);

long tsc_cycles;
ddprof_stats_get(STATS_UNWIND_AVG_TIME, &tsc_cycles);
int64_t const avg_unwind_ns =
@@ -391,7 +394,9 @@ DDRes ddprof_pr_sample(DDProfContext &ctx, perf_event_sample *sample,
// Aggregate if unwinding went well (todo : fatal error propagation)
if (!IsDDResFatal(res)) {
struct UnwindState *us = ctx.worker_ctx.us;
if (Any(EventAggregationMode::kLiveSum & watcher->aggregation_mode)) {
if (Any(EventAggregationMode::kLiveSum & watcher->aggregation_mode) &&
sample->addr) {
// a null address means we should not account for it
ctx.worker_ctx.live_allocation.register_allocation(
us->output, sample->addr, sample->period, watcher_pos, sample->pid);
}
@@ -499,8 +504,7 @@ DDRes ddprof_worker_cycle(DDProfContext &ctx,
ctx.worker_ctx.cycle_start_time = cycle_now;

// Scrape procfs for process usage statistics
DDRES_CHECK_FWD(worker_update_stats(&ctx.worker_ctx.proc_status,
*ctx.worker_ctx.us, cycle_duration));
DDRES_CHECK_FWD(worker_update_stats(ctx.worker_ctx, cycle_duration));

// And emit diagnostic output (if it's enabled)
print_diagnostics(ctx.worker_ctx.us->dso_hdr);
@@ -527,7 +531,7 @@ DDRes ddprof_worker_cycle(DDProfContext &ctx,
export_time_set(ctx);
}
unwind_cycle(ctx.worker_ctx.us);

ctx.worker_ctx.live_allocation.cycle();
// Reset stats relevant to a single cycle
ddprof_reset_worker_stats();

120 changes: 120 additions & 0 deletions src/lib/address_bitset.cc
@@ -0,0 +1,120 @@
// Unless explicitly stated otherwise all files in this repository are licensed
// under the Apache License Version 2.0. This product includes software
// developed at Datadog (https://www.datadoghq.com/). Copyright 2021-Present
// Datadog, Inc.
#include "address_bitset.hpp"

#include <algorithm>
#include <bit>
#include <functional>
#include <unlikely.hpp>

namespace ddprof {

namespace {
unsigned round_up_to_power_of_two(unsigned num) {
if (num == 0) {
return num;
}
// If num is already a power of two
if ((num & (num - 1)) == 0) {
return num;
}
// not a power of two
unsigned count = 0;
while (num) {
num >>= 1;
count++;
}
return 1 << count;
}
} // namespace

AddressBitset::AddressBitset(AddressBitset &&other) noexcept {
move_from(other);
}

AddressBitset &AddressBitset::operator=(AddressBitset &&other) noexcept {
if (this != &other) {
move_from(other);
}
return *this;
}

void AddressBitset::move_from(AddressBitset &other) noexcept {
_lower_bits_ignored = other._lower_bits_ignored;
_bitset_size = other._bitset_size;
_k_nb_words = other._k_nb_words;
_nb_bits_mask = other._nb_bits_mask;
_address_bitset = std::move(other._address_bitset);
_nb_addresses.store(other._nb_addresses.load());

// Reset the state of 'other'
other._bitset_size = 0;
other._k_nb_words = 0;
other._nb_bits_mask = 0;
other._nb_addresses = 0;
}

void AddressBitset::init(unsigned bitset_size) {
// Due to memory alignment, on 64 bits we can assume that the first 4
// bits can be ignored
_lower_bits_ignored = _k_max_bits_ignored;
if (_address_bitset) {
_address_bitset.reset();
}
_bitset_size = round_up_to_power_of_two(bitset_size);
_k_nb_words = (_bitset_size) / (_nb_bits_per_word);
if (_bitset_size) {
_nb_bits_mask = _bitset_size - 1;
_address_bitset = std::make_unique<std::atomic<uint64_t>[]>(_k_nb_words);
}
}

bool AddressBitset::add(uintptr_t addr) {
const uint32_t significant_bits = hash_significant_bits(addr);
// As per nsavoire's comment, it is better to use separate operators
// than to use the div instruction which generates an extra function call
// Also, the usage of a power of two value allows for bit operations
const unsigned index_array = significant_bits / _nb_bits_per_word;
const unsigned bit_offset = significant_bits % _nb_bits_per_word;
const Word_t bit_in_element = (1UL << bit_offset);
// there is a possible race between checking the value
// and setting it
if (!(_address_bitset[index_array].fetch_or(bit_in_element) &
bit_in_element)) {
// check that the element was not already set
++_nb_addresses;
return true;
}
// Collision, element was already set
return false;
}

bool AddressBitset::remove(uintptr_t addr) {
const int significant_bits = hash_significant_bits(addr);
const unsigned index_array = significant_bits / _nb_bits_per_word;
const unsigned bit_offset = significant_bits % _nb_bits_per_word;
const Word_t bit_in_element = (1UL << bit_offset);
if (_address_bitset[index_array].fetch_and(~bit_in_element) &
bit_in_element) {
_nb_addresses.fetch_sub(1, std::memory_order_relaxed);
// in the unlikely event of a clear right at the wrong time, we could
// have a negative number of elements (though count desyncs are acceptable)
return true;
}
return false;
}

void AddressBitset::clear() {
for (unsigned i = 0; i < _k_nb_words; ++i) {
const Word_t original_value = _address_bitset[i].exchange(0);
// Count number of set bits in original_value
const int num_set_bits = std::popcount(original_value);
if (num_set_bits > 0) {
_nb_addresses.fetch_sub(num_set_bits, std::memory_order_relaxed);
}
}
}

} // namespace ddprof
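A small usage sketch of the new class, assuming it is compiled and linked as part of this commit; the include path is a guess based on the file layout above.

#include "lib/address_bitset.hpp" // assumed include path
#include <cstdint>
#include <cstdio>

int main() {
  ddprof::AddressBitset tracked{ddprof::AddressBitset::_k_default_bitset_size};

  const uintptr_t addr = 0x7f00deadbef0;
  const bool inserted = tracked.add(addr);   // true unless a hash collision
  const bool removed = tracked.remove(addr); // true if the bit was set
  printf("inserted=%d removed=%d count=%d\n", inserted, removed, tracked.count());

  tracked.clear(); // zeroes every word and the address counter
  return 0;
}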