diff --git a/include/lib/allocation_tracker.hpp b/include/lib/allocation_tracker.hpp index c2bfc9b2f..4ce24a3fb 100644 --- a/include/lib/allocation_tracker.hpp +++ b/include/lib/allocation_tracker.hpp @@ -5,16 +5,17 @@ #pragma once +#include "allocation_tracker_tls.hpp" #include "ddprof_base.hpp" #include "ddres_def.hpp" #include "pevent.hpp" -#include "span.hpp" +#include "reentry_guard.hpp" #include "unlikely.hpp" #include #include #include -#include +#include #include namespace ddprof { @@ -22,22 +23,6 @@ namespace ddprof { class MPSCRingBufferWriter; struct RingBufferInfo; -struct TrackerThreadLocalState { - int64_t remaining_bytes; // remaining allocation bytes until next sample - bool remaining_bytes_initialized; // false if remaining_bytes is not - // initialized - ddprof::span stack_bounds; - - // In the choice of random generators, this one is smaller - // - smaller than mt19937 (8 vs 5K) - std::minstd_rand _gen{std::random_device{}()}; - - pid_t tid; // cache of tid - - bool reentry_guard; // prevent reentry in AllocationTracker (eg. when - // allocation are done inside AllocationTracker) -}; - class AllocationTracker { public: friend class AllocationTrackerDisablerForCurrentThread; @@ -63,11 +48,17 @@ class AllocationTracker { static void allocation_tracking_free(); static inline DDPROF_NO_SANITIZER_ADDRESS void - track_allocation(uintptr_t addr, size_t size); - static inline void track_deallocation(uintptr_t addr); + track_allocation_s(uintptr_t addr, size_t size, + TrackerThreadLocalState &tl_state); + + static inline void track_deallocation_s(uintptr_t addr); static inline bool is_active(); + static TrackerThreadLocalState *init_tl_state(); + // can return null (does not init) + static TrackerThreadLocalState *get_tl_state(); + private: using AdressSet = std::unordered_set; @@ -97,6 +88,10 @@ class AllocationTracker { static AllocationTracker *create_instance(); + static void delete_tl_state(void *tl_state); + + static void make_key(); + void track_allocation(uintptr_t addr, size_t size, TrackerThreadLocalState &tl_state); void track_deallocation(uintptr_t addr, TrackerThreadLocalState &tl_state); @@ -120,11 +115,18 @@ class AllocationTracker { bool _deterministic_sampling; AdressSet _address_set; - static thread_local TrackerThreadLocalState _tl_state; + // These can not be tied to the internal state of the instance. 
+ // The creation of the instance depends on this + static pthread_once_t _key_once; // ensures we call key creation a single time + static pthread_key_t _tl_state_key; + // For Thread reentry guard of init_tl_state + static ThreadEntries _thread_entries; + static AllocationTracker *_instance; }; -void AllocationTracker::track_allocation(uintptr_t addr, size_t size) { +void AllocationTracker::track_allocation_s(uintptr_t addr, size_t size, + TrackerThreadLocalState &tl_state) { AllocationTracker *instance = _instance; // Be safe, if allocation tracker has not been initialized, just bail out @@ -136,10 +138,6 @@ void AllocationTracker::track_allocation(uintptr_t addr, size_t size) { return; } - // In shared libraries, TLS access requires a call to tls_get_addr, - // therefore obtain a pointer on TLS state once and pass it around - TrackerThreadLocalState &tl_state = _tl_state; - tl_state.remaining_bytes += size; if (likely(tl_state.remaining_bytes < 0)) { return; @@ -155,19 +153,18 @@ void AllocationTracker::track_allocation(uintptr_t addr, size_t size) { } } -void AllocationTracker::track_deallocation(uintptr_t addr) { +void AllocationTracker::track_deallocation_s(uintptr_t addr) { // same pattern as track_allocation AllocationTracker *instance = _instance; - if (!instance) { return; } - TrackerThreadLocalState &tl_state = _tl_state; - if (instance->_state.track_deallocations.load(std::memory_order_relaxed)) { - // not cool as we are always calling this (high overhead). Can we do better - // ? - instance->track_deallocation(addr, tl_state); + TrackerThreadLocalState *tl_state = get_tl_state(); + if (unlikely(!tl_state)) { + return; + } + instance->track_deallocation(addr, *tl_state); } } diff --git a/include/lib/allocation_tracker_tls.hpp b/include/lib/allocation_tracker_tls.hpp new file mode 100644 index 000000000..434d3122d --- /dev/null +++ b/include/lib/allocation_tracker_tls.hpp @@ -0,0 +1,28 @@ +#pragma once + +#include "span.hpp" + +#include +#include +#include + +namespace ddprof { + +struct TrackerThreadLocalState { + int64_t remaining_bytes; // remaining allocation bytes until next sample + bool remaining_bytes_initialized; // false if remaining_bytes is not + // initialized + ddprof::span stack_bounds; + + // In the choice of random generators, this one is smaller + // - smaller than mt19937 (8 vs 5K) + std::minstd_rand _gen{std::random_device{}()}; + + pid_t tid; // cache of tid + + bool reentry_guard; // prevent reentry in AllocationTracker (eg. when + // allocation are done inside AllocationTracker) + bool double_tracking_guard; // prevent mmap tracking within a malloc +}; + +} // namespace ddprof diff --git a/include/lib/lib_logger.hpp b/include/lib/lib_logger.hpp new file mode 100644 index 000000000..49529406d --- /dev/null +++ b/include/lib/lib_logger.hpp @@ -0,0 +1,22 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. This product includes software +// developed at Datadog (https://www.datadoghq.com/). Copyright 2021-Present +// Datadog, Inc. + +#pragma once + +#include +#include +#include + +namespace ddprof { +template +void log_once(char const *const format, Args... 
args) { +#ifndef DEBUG + static std::once_flag flag; + std::call_once(flag, [&, format]() { fprintf(stderr, format, args...); }); +#else + fprintf(stderr, format, args...); +#endif +} +} // namespace ddprof diff --git a/include/lib/reentry_guard.hpp b/include/lib/reentry_guard.hpp new file mode 100644 index 000000000..38c0e0880 --- /dev/null +++ b/include/lib/reentry_guard.hpp @@ -0,0 +1,106 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. This product includes software +// developed at Datadog (https://www.datadoghq.com/). Copyright 2021-Present +// Datadog, Inc. + +#pragma once + +#include +#include +#include + +namespace ddprof { + +class ThreadEntries { +public: + static constexpr size_t max_threads = 10; + std::array, max_threads> thread_entries; + ThreadEntries() { reset(); } + void reset() { + for (auto &entry : thread_entries) { + entry.store(-1, std::memory_order_relaxed); + } + } +}; + +class TLReentryGuard { +public: + explicit TLReentryGuard(ThreadEntries &entries, pid_t tid) + : _entries(entries), _tid(tid), _ok(false), _index(-1) { + while (true) { + for (size_t i = 0; i < ThreadEntries::max_threads; ++i) { + pid_t expected = -1; + if (_entries.thread_entries[i].compare_exchange_weak( + expected, tid, std::memory_order_acq_rel)) { + _ok = true; + _index = i; + return; + } else if (expected == tid) { + // This thread is already in the entries. + return; + } + } + // If we've reached here, all slots are occupied and none of them belongs + // to this thread. Let's yield to other threads and then try again. + std::this_thread::yield(); + } + } + + ~TLReentryGuard() { + if (_ok) { + _entries.thread_entries[_index].store(-1, std::memory_order_release); + } + } + + explicit operator bool() const { return _ok; } + + TLReentryGuard(const TLReentryGuard &) = delete; + TLReentryGuard &operator=(const TLReentryGuard &) = delete; + +private: + ThreadEntries &_entries; + pid_t _tid; + bool _ok; + int _index; +}; + +class ReentryGuard { +public: + explicit ReentryGuard(bool *reentry_guard) + : _reentry_guard(reentry_guard), _ok(false) { + if (_reentry_guard) { + _ok = (!*_reentry_guard); + *_reentry_guard = true; + } + } + + ~ReentryGuard() { + if (_ok) { + *_reentry_guard = false; + } + } + + bool register_guard(bool *reentry_guard) { + if (_reentry_guard) { + // not supported (already registered to other bool) + return false; + } + if (reentry_guard) { + _reentry_guard = reentry_guard; + _ok = (!*_reentry_guard); + *_reentry_guard = true; + } + return _ok; + } + + explicit operator bool() const { return _ok; } + + ReentryGuard(const ReentryGuard &) = delete; + ReentryGuard &operator=(const ReentryGuard &) = delete; + +private: + bool *_reentry_guard; + bool _ok; +}; + +} // namespace ddprof diff --git a/src/lib/allocation_tracker.cc b/src/lib/allocation_tracker.cc index 8b3aa7a3d..d095445d0 100644 --- a/src/lib/allocation_tracker.cc +++ b/src/lib/allocation_tracker.cc @@ -8,10 +8,9 @@ #include "allocation_event.hpp" #include "ddprof_perf_event.hpp" #include "ddres.hpp" -#include "defer.hpp" #include "ipc.hpp" +#include "lib_logger.hpp" #include "live_allocation-c.hpp" -#include "perf.hpp" #include "pevent_lib.hpp" #include "ringbuffer_utils.hpp" #include "savecontext.hpp" @@ -19,7 +18,6 @@ #include #include -#include #include #include @@ -32,30 +30,50 @@ struct LostEvent { uint64_t lost; }; -class ReentryGuard { -public: - explicit ReentryGuard(bool *reentry_guard) - : _reentry_guard(reentry_guard), 
_ok(!*reentry_guard) { - *_reentry_guard = true; - } - ~ReentryGuard() { - if (_ok) { - *_reentry_guard = false; - } - } +// Static declarations +pthread_once_t AllocationTracker::_key_once = PTHREAD_ONCE_INIT; - explicit operator bool() const { return _ok; } +pthread_key_t AllocationTracker::_tl_state_key; +ThreadEntries AllocationTracker::_thread_entries; - ReentryGuard(const ReentryGuard &) = delete; - ReentryGuard &operator=(const ReentryGuard &) = delete; +AllocationTracker *AllocationTracker::_instance; -private: - bool *_reentry_guard; - bool _ok; -}; +TrackerThreadLocalState *AllocationTracker::get_tl_state() { + // In shared libraries, TLS access requires a call to tls_get_addr, + // tls_get_addr can call into malloc, which can create a recursive loop + // instead we call pthread APIs to control the creation of TLS objects + pthread_once(&_key_once, make_key); + TrackerThreadLocalState *tl_state = + (TrackerThreadLocalState *)pthread_getspecific(_tl_state_key); + return tl_state; +} -AllocationTracker *AllocationTracker::_instance; -thread_local TrackerThreadLocalState AllocationTracker::_tl_state; +TrackerThreadLocalState *AllocationTracker::init_tl_state() { + TrackerThreadLocalState *tl_state = nullptr; + int res_set = 0; + + pid_t tid = ddprof::gettid(); + // As we allocate within this function, this can be called twice + TLReentryGuard tl_reentry_guard(_thread_entries, tid); + if (!tl_reentry_guard) { +#ifdef DEBUG + fprintf(stderr, "Unable to grab reentry guard %d \n", tid); +#endif + return tl_state; + } + + tl_state = new TrackerThreadLocalState(); + res_set = pthread_setspecific(_tl_state_key, tl_state); + tl_state->tid = tid; + + if (res_set) { + // should return 0 + log_once("Error: Unable to store tl_state. error %d \n", res_set); + delete tl_state; + tl_state = nullptr; + } + return tl_state; +} AllocationTracker::AllocationTracker() {} @@ -64,10 +82,29 @@ AllocationTracker *AllocationTracker::create_instance() { return &tracker; } +void AllocationTracker::delete_tl_state(void *tl_state) { + delete (TrackerThreadLocalState *)tl_state; +} + +void AllocationTracker::make_key() { + // delete is called on all key objects + pthread_key_create(&_tl_state_key, delete_tl_state); +} + DDRes AllocationTracker::allocation_tracking_init( uint64_t allocation_profiling_rate, uint32_t flags, uint32_t stack_sample_size, const RingBufferInfo &ring_buffer) { - ReentryGuard guard(&_tl_state.reentry_guard); + TrackerThreadLocalState *tl_state = get_tl_state(); + if (!tl_state) { + // This is the time at which the init_tl_state should not fail + // We will not attempt to re-create it in other code paths + tl_state = init_tl_state(); + if (!tl_state) { + return ddres_error(DD_WHAT_DWFL_LIB_ERROR); + } + } + + ReentryGuard guard(&tl_state->reentry_guard); AllocationTracker *instance = create_instance(); auto &state = instance->_state; @@ -114,7 +151,7 @@ void AllocationTracker::free() { // Do not destroy the object: // there is an inherent race condition between checking - // `_state.track_allocation` and calling `_instance->track_allocation`. + // `_state.track_allocations ` and calling `_instance->track_allocation`. // That's why AllocationTracker is kept in a usable state and // `_track_allocation` is checked again in `_instance->track_allocation` while // taking the mutex lock. 
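[Note on the hunk above] `get_tl_state` deliberately never allocates, while `init_tl_state` may; that split is what keeps the malloc/free hooks reentrancy-safe. A minimal standalone sketch of the same pthread-key pattern, with illustrative names that are not the ddprof API:

  #include <pthread.h>

  struct ThreadState { // stand-in for TrackerThreadLocalState
    bool reentry_guard = false;
  };

  static pthread_once_t g_key_once = PTHREAD_ONCE_INIT;
  static pthread_key_t g_key;

  // Destructor registered with the key: runs automatically at thread exit,
  // which is why delete_tl_state is handed to pthread_key_create above.
  static void delete_state(void *p) { delete static_cast<ThreadState *>(p); }
  static void make_key() { pthread_key_create(&g_key, delete_state); }

  // Fast path: no allocation, safe to call from inside allocator hooks.
  ThreadState *get_state() {
    pthread_once(&g_key_once, make_key);
    return static_cast<ThreadState *>(pthread_getspecific(g_key));
  }

  // Slow path: allocates, hence the TLReentryGuard around init_tl_state.
  ThreadState *init_state() {
    auto *s = new ThreadState();
    if (pthread_setspecific(g_key, s) != 0) {
      delete s;
      return nullptr;
    }
    return s;
  }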
@@ -126,7 +163,13 @@ void AllocationTracker::allocation_tracking_free() { if (!instance) { return; } - ReentryGuard guard(&_tl_state.reentry_guard); + TrackerThreadLocalState *tl_state = get_tl_state(); + if (unlikely(!tl_state)) { + log_once("Error: Unable to find tl_state during %s\n", __FUNCTION__); + instance->free(); + return; + } + ReentryGuard guard(&tl_state->reentry_guard); std::lock_guard lock{instance->_state.mutex}; instance->free(); } @@ -200,8 +243,8 @@ void AllocationTracker::track_allocation(uintptr_t addr, size_t size, if (IsDDResOK(push_clear_live_allocation(tl_state))) { _address_set.clear(); } else { - fprintf( - stderr, + log_once( + "Error: %s", "Stop allocation profiling. Unable to clear live allocation \n"); free(); } @@ -447,17 +490,35 @@ uint64_t AllocationTracker::next_sample_interval(std::minstd_rand &gen) { } void AllocationTracker::notify_thread_start() { - TrackerThreadLocalState &tl_state = AllocationTracker::_tl_state; + TrackerThreadLocalState *tl_state = get_tl_state(); + if (unlikely(!tl_state)) { + tl_state = init_tl_state(); + if (!tl_state) { + log_once("Error: Unable to start allocation profiling on thread %d", + ddprof::gettid()); + return; + } + } - ReentryGuard guard(&_tl_state.reentry_guard); - tl_state.stack_bounds = retrieve_stack_bounds(); + ReentryGuard guard(&tl_state->reentry_guard); + tl_state->stack_bounds = retrieve_stack_bounds(); // error can not be propagated in thread create } void AllocationTracker::notify_fork() { + _thread_entries.reset(); if (_instance) { _instance->_state.pid = 0; - _tl_state.tid = 0; + } + TrackerThreadLocalState *tl_state = get_tl_state(); + if (unlikely(!tl_state)) { + // The state should already exist if we forked. + // This would mean that we were not able to create the state before forking + log_once("Error: Unable to retrieve tl state after fork thread %d", + ddprof::gettid()); + return; + } else { + tl_state->tid = 0; } } diff --git a/src/lib/dd_profiling.cc b/src/lib/dd_profiling.cc index e9410bdb1..b0be341f0 100644 --- a/src/lib/dd_profiling.cc +++ b/src/lib/dd_profiling.cc @@ -12,11 +12,13 @@ #include "defer.hpp" #include "ipc.hpp" #include "lib_embedded_data.h" +#include "lib_logger.hpp" #include "logger_setup.hpp" #include "signal_helper.hpp" #include "symbol_overrides.hpp" #include "syscalls.hpp" +#include #include #include #include @@ -269,12 +271,19 @@ int ddprof_start_profiling_internal() { // \fixme{nsavoire} what should we do when allocation tracker init // fails ? g_state.allocation_profiling_started = true; + } else { + ddprof::log_once("Error: %s", + "Failure to start allocation profiling\n"); } } } catch (const ddprof::DDException &e) { return -1; } if (g_state.allocation_profiling_started) { - pthread_atfork(nullptr, nullptr, notify_fork); + int res = pthread_atfork(nullptr, nullptr, notify_fork); + if (res) { + ddprof::log_once("Error:%s", "Unable to setup notify fork"); + assert(0); + } } g_state.started = true; set_profiler_library_active(); diff --git a/src/lib/glibc_fixes.c b/src/lib/glibc_fixes.c index 6720c5bb2..439901e26 100644 --- a/src/lib/glibc_fixes.c +++ b/src/lib/glibc_fixes.c @@ -3,10 +3,18 @@ // developed at Datadog (https://www.datadoghq.com/). Copyright 2021-Present // Datadog, Inc. 
+#define _GNU_SOURCE // required for RTLD_NEXT + +#include #include #include #include +#ifdef DEBUG +# include +# include +#endif + #ifndef _STAT_VER_LINUX # ifndef __x86_64__ # define _STAT_VER_LINUX 0 @@ -47,6 +55,7 @@ __attribute__((unused)) int __fstat(int fd, struct stat *buf) { } // Should not happen + assert(0); return -1; } @@ -67,6 +76,7 @@ __attribute__((unused)) int __stat(const char *pathname, struct stat *buf) { } // Should not happen + assert(0); return -1; } @@ -84,20 +94,43 @@ extern int pthread_atfork(void (*prepare)(void), void (*parent)(void), * to provide our own definition... */ int __pthread_atfork(void (*prepare)(void), void (*parent)(void), void (*child)(void)) { + static __typeof(pthread_atfork) *s_func = NULL; + // if __register_atfork is available (glibc), call it directly if (__register_atfork) { +#ifdef DEBUG + fprintf(stderr, "We call __register_atfork \n"); +#endif return __register_atfork(prepare, parent, child, __dso_handle); } - static __typeof(pthread_atfork) *s_func = NULL; // we must be on musl, look up pthread_atfork if (s_func == NULL && dlsym) { +#ifdef DEBUG + fprintf(stderr, "We look for pthread_atfork (musl code path)\n"); +#endif s_func = (__typeof(s_func))dlsym(RTLD_NEXT, "pthread_atfork"); + if (s_func == NULL) { + // We need to look for default symbol when preloading + s_func = (__typeof(s_func))dlsym(RTLD_DEFAULT, "pthread_atfork"); + if (s_func == &__pthread_atfork) { + // prevent infinite loop + s_func = NULL; + } + } } + if (s_func) { +#ifdef DEBUG + fprintf(stderr, "return through s_func \n"); +#endif return s_func(prepare, parent, child); } // Should not happen +#ifdef DEBUG + fprintf(stderr, "FAIL \n"); +#endif + assert(0); return -1; } diff --git a/src/lib/symbol_overrides.cc b/src/lib/symbol_overrides.cc index 991a6ac92..395aff00a 100644 --- a/src/lib/symbol_overrides.cc +++ b/src/lib/symbol_overrides.cc @@ -8,6 +8,7 @@ #include "allocation_tracker.hpp" #include "ddprof_base.hpp" #include "elfutils.hpp" +#include "reentry_guard.hpp" #include "unlikely.hpp" #include @@ -44,27 +45,6 @@ timer_t g_timerid; int g_timer_sig = -1; int g_nb_loaded_libraries = -1; -// \fixme{nsavoire} The goal of this flag is to avoid double-counting -// mmaps that done inside malloc, not sure if it desirable or not. -// We should probably merge this TL state with AllocationTracker::tl_state -// to have a single TL state since accessing it is costly (call to tls_get_addr) -thread_local bool g_in_allocator_guard = false; - -class Guard { -public: - explicit Guard(bool *guard) : _guard(guard), _ok(!*guard) { *_guard = true; } - ~Guard() { - if (_ok) { - *_guard = false; - } - } - explicit operator bool() const { return _ok; } - -private: - bool *_guard; - bool _ok; -}; - DDPROF_NOINLINE bool loaded_libraries_have_changed() { int nb = ddprof::count_loaded_libraries(); if (nb != g_nb_loaded_libraries) { @@ -92,10 +72,15 @@ struct malloc { static void *hook(size_t size) noexcept { check_libraries(); - Guard guard(&g_in_allocator_guard); + ddprof::TrackerThreadLocalState *tl_state = + ddprof::AllocationTracker::get_tl_state(); + ddprof::ReentryGuard guard(tl_state ? 
&(tl_state->double_tracking_guard) + : nullptr); auto ptr = ref(size); - ddprof::AllocationTracker::track_allocation( - reinterpret_cast(ptr), size); + if (guard) { + ddprof::AllocationTracker::track_allocation_s( + reinterpret_cast(ptr), size, *tl_state); + } return ptr; } }; @@ -111,7 +96,7 @@ struct free { return; } - ddprof::AllocationTracker::track_deallocation( + ddprof::AllocationTracker::track_deallocation_s( reinterpret_cast(ptr)); ref(ptr); } @@ -124,11 +109,15 @@ struct calloc { static void *hook(size_t nmemb, size_t size) noexcept { check_libraries(); - Guard guard(&g_in_allocator_guard); + ddprof::TrackerThreadLocalState *tl_state = + ddprof::AllocationTracker::get_tl_state(); + ddprof::ReentryGuard guard(tl_state ? &(tl_state->double_tracking_guard) + : nullptr); auto ptr = ref(nmemb, size); - ddprof::AllocationTracker::track_allocation( - reinterpret_cast(ptr), size * nmemb); - + if (guard) { + ddprof::AllocationTracker::track_allocation_s( + reinterpret_cast(ptr), size * nmemb, *tl_state); + } return ptr; } }; @@ -140,16 +129,19 @@ struct realloc { static void *hook(void *ptr, size_t size) noexcept { check_libraries(); - Guard guard(&g_in_allocator_guard); if (likely(ptr)) { - ddprof::AllocationTracker::track_deallocation( + ddprof::AllocationTracker::track_deallocation_s( reinterpret_cast(ptr)); } - + ddprof::TrackerThreadLocalState *tl_state = + ddprof::AllocationTracker::get_tl_state(); + // lifetime of guard should exceed the call to ref function + ddprof::ReentryGuard guard(tl_state ? &(tl_state->double_tracking_guard) + : nullptr); auto newptr = ref(ptr, size); - if (likely(size)) { - ddprof::AllocationTracker::track_allocation( - reinterpret_cast(newptr), size); + if (likely(size) && guard) { + ddprof::AllocationTracker::track_allocation_s( + reinterpret_cast(newptr), size, *tl_state); } return newptr; @@ -163,11 +155,14 @@ struct posix_memalign { static int hook(void **memptr, size_t alignment, size_t size) noexcept { check_libraries(); - Guard guard(&g_in_allocator_guard); + ddprof::TrackerThreadLocalState *tl_state = + ddprof::AllocationTracker::get_tl_state(); + ddprof::ReentryGuard guard(tl_state ? &(tl_state->double_tracking_guard) + : nullptr); auto ret = ref(memptr, alignment, size); - if (likely(!ret)) { - ddprof::AllocationTracker::track_allocation( - reinterpret_cast(*memptr), size); + if (likely(!ret) && guard) { + ddprof::AllocationTracker::track_allocation_s( + reinterpret_cast(*memptr), size, *tl_state); } return ret; } @@ -180,11 +175,15 @@ struct aligned_alloc { static void *hook(size_t alignment, size_t size) noexcept { check_libraries(); - Guard guard(&g_in_allocator_guard); + ddprof::TrackerThreadLocalState *tl_state = + ddprof::AllocationTracker::get_tl_state(); + ddprof::ReentryGuard guard(tl_state ? &(tl_state->double_tracking_guard) + : nullptr); auto ptr = ref(alignment, size); - ddprof::AllocationTracker::track_allocation( - reinterpret_cast(ptr), size); - + if (ptr && guard) { + ddprof::AllocationTracker::track_allocation_s( + reinterpret_cast(ptr), size, *tl_state); + } return ptr; } }; @@ -196,11 +195,15 @@ struct memalign { static void *hook(size_t alignment, size_t size) noexcept { check_libraries(); - Guard guard(&g_in_allocator_guard); + ddprof::TrackerThreadLocalState *tl_state = + ddprof::AllocationTracker::get_tl_state(); + ddprof::ReentryGuard guard(tl_state ? 
&(tl_state->double_tracking_guard) + : nullptr); auto ptr = ref(alignment, size); - ddprof::AllocationTracker::track_allocation( - reinterpret_cast(ptr), size); - + if (ptr && guard) { + ddprof::AllocationTracker::track_allocation_s( + reinterpret_cast(ptr), size, *tl_state); + } return ptr; } }; @@ -212,11 +215,15 @@ struct pvalloc { static void *hook(size_t size) noexcept { check_libraries(); - Guard guard(&g_in_allocator_guard); + ddprof::TrackerThreadLocalState *tl_state = + ddprof::AllocationTracker::get_tl_state(); + ddprof::ReentryGuard guard(tl_state ? &(tl_state->double_tracking_guard) + : nullptr); auto ptr = ref(size); - ddprof::AllocationTracker::track_allocation( - reinterpret_cast(ptr), size); - + if (ptr && guard) { + ddprof::AllocationTracker::track_allocation_s( + reinterpret_cast(ptr), size, *tl_state); + } return ptr; } }; @@ -228,11 +235,15 @@ struct valloc { static void *hook(size_t size) noexcept { check_libraries(); - Guard guard(&g_in_allocator_guard); + ddprof::TrackerThreadLocalState *tl_state = + ddprof::AllocationTracker::get_tl_state(); + ddprof::ReentryGuard guard(tl_state ? &(tl_state->double_tracking_guard) + : nullptr); auto ptr = ref(size); - ddprof::AllocationTracker::track_allocation( - reinterpret_cast(ptr), size); - + if (ptr && guard) { + ddprof::AllocationTracker::track_allocation_s( + reinterpret_cast(ptr), size, *tl_state); + } return ptr; } }; @@ -244,14 +255,19 @@ struct reallocarray { static void *hook(void *ptr, size_t nmemb, size_t size) noexcept { check_libraries(); - Guard guard(&g_in_allocator_guard); if (ptr) { - ddprof::AllocationTracker::track_deallocation( + ddprof::AllocationTracker::track_deallocation_s( reinterpret_cast(ptr)); } + ddprof::TrackerThreadLocalState *tl_state = + ddprof::AllocationTracker::get_tl_state(); + ddprof::ReentryGuard guard(tl_state ? &(tl_state->double_tracking_guard) + : nullptr); auto newptr = ref(ptr, nmemb, size); - ddprof::AllocationTracker::track_allocation( - reinterpret_cast(ptr), size * nmemb); + if (newptr && guard) { + ddprof::AllocationTracker::track_allocation_s( + reinterpret_cast(newptr), size * nmemb, *tl_state); + } return newptr; } }; @@ -315,12 +331,14 @@ struct mmap { static void *hook(void *addr, size_t length, int prot, int flags, int fd, off_t offset) noexcept { + ddprof::TrackerThreadLocalState *tl_state = + ddprof::AllocationTracker::get_tl_state(); + ddprof::ReentryGuard guard(tl_state ? &(tl_state->double_tracking_guard) + : nullptr); void *ptr = ref(addr, length, prot, flags, fd, offset); - - if (addr == nullptr && fd == -1 && ptr != nullptr && - !g_in_allocator_guard) { - ddprof::AllocationTracker::track_allocation( - reinterpret_cast(ptr), length); + if (addr == nullptr && fd == -1 && ptr != nullptr && guard) { + ddprof::AllocationTracker::track_allocation_s( + reinterpret_cast(ptr), length, *tl_state); } return ptr; } @@ -333,12 +351,14 @@ struct mmap_ { static void *hook(void *addr, size_t length, int prot, int flags, int fd, off_t offset) noexcept { + ddprof::TrackerThreadLocalState *tl_state = + ddprof::AllocationTracker::get_tl_state(); + ddprof::ReentryGuard guard(tl_state ? 
&(tl_state->double_tracking_guard) + : nullptr); void *ptr = ref(addr, length, prot, flags, fd, offset); - - if (addr == nullptr && fd == -1 && ptr != nullptr && - !g_in_allocator_guard) { - ddprof::AllocationTracker::track_allocation( - reinterpret_cast(ptr), length); + if (addr == nullptr && fd == -1 && ptr != nullptr && guard) { + ddprof::AllocationTracker::track_allocation_s( + reinterpret_cast(ptr), length, *tl_state); } return ptr; } @@ -351,12 +371,14 @@ struct mmap64_ { static void *hook(void *addr, size_t length, int prot, int flags, int fd, off_t offset) noexcept { + ddprof::TrackerThreadLocalState *tl_state = + ddprof::AllocationTracker::get_tl_state(); + ddprof::ReentryGuard guard(tl_state ? &(tl_state->double_tracking_guard) + : nullptr); void *ptr = ref(addr, length, prot, flags, fd, offset); - - if (addr == nullptr && fd == -1 && ptr != nullptr && - !g_in_allocator_guard) { - ddprof::AllocationTracker::track_allocation( - reinterpret_cast(ptr), length); + if (addr == nullptr && fd == -1 && ptr != nullptr && guard) { + ddprof::AllocationTracker::track_allocation_s( + reinterpret_cast(ptr), length, *tl_state); } return ptr; } @@ -368,10 +390,8 @@ struct munmap { static inline bool ref_checked = false; static int hook(void *addr, size_t length) noexcept { - if (!g_in_allocator_guard) { - ddprof::AllocationTracker::track_deallocation( - reinterpret_cast(addr)); - } + ddprof::AllocationTracker::track_deallocation_s( + reinterpret_cast(addr)); return ref(addr, length); } }; @@ -382,10 +402,8 @@ struct munmap_ { static inline bool ref_checked = false; static int hook(void *addr, size_t length) noexcept { - if (!g_in_allocator_guard) { - ddprof::AllocationTracker::track_deallocation( - reinterpret_cast(addr)); - } + ddprof::AllocationTracker::track_deallocation_s( + reinterpret_cast(addr)); return ref(addr, length); } }; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 8190da73d..fa0445b57 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -5,6 +5,8 @@ find_package(GTest REQUIRED) find_package(benchmark REQUIRED) +add_subdirectory(no_tls) + enable_testing() # On arm builds Leak Detection is barely usable: https://github.com/google/sanitizers/issues/703 @@ -334,6 +336,10 @@ add_unit_test(live_allocation-ut live_allocation-ut.cc ../src/live_allocation.cc add_unit_test(ddprof_process-ut ddprof_process-ut.cc ../src/container_id.cc ../src/ddprof_process.cc) +add_unit_test(glibc_fixes-ut glibc_fixes-ut.cc ../src/lib/glibc_fixes.c LIBRARIES pthread) + +add_unit_test(reentry_guard-ut reentry_guard-ut.cc) + add_unit_test( ddprof_cli-ut ddprof_cli-ut.cc @@ -344,6 +350,8 @@ add_unit_test( ../src/tracepoint_config.cc LIBRARIES DDProf::Parser) +add_unit_test(pthread_tls-ut pthread_tls-ut.cc) + add_benchmark(savecontext-bench savecontext-bench.cc ../src/lib/pthread_fixes.cc ../src/lib/savecontext.cc ../src/lib/saveregisters.cc) @@ -391,6 +399,11 @@ if(NOT CMAKE_BUILD_TYPE STREQUAL "SanitizedDebug") COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/simple_malloc-ut.sh WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + add_test( + NAME check_no_tls + COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/check_no_tls-ut.sh + WORKING_DIRECTORY ${CMAKE_BINARY_DIR}) + if(BUILD_UNIVERSAL_DDPROF) add_test(NAME check-injected-lib COMMAND ${CMAKE_SOURCE_DIR}/tools/check_for_unsafe_libc_functions.py diff --git a/test/allocation_tracker-bench.cc b/test/allocation_tracker-bench.cc index 25857c166..5fa708737 100644 --- a/test/allocation_tracker-bench.cc +++ b/test/allocation_tracker-bench.cc @@ -1,31 +1,66 @@ #include +#include 
+#include +#include +#include +#include +#include + #include "allocation_tracker.hpp" #include "loghandle.hpp" #include "ringbuffer_holder.hpp" -#include +// Global bench settings +// Activate live heap tracking +// #define LIVE_HEAP +// Sampling rate: default rate is 524288 +static constexpr uint64_t k_rate = 20000; + +#define READER_THREAD +std::atomic<bool> reader_continue{true}; + +// Reader worker thread function +void read_buffer(ddprof::RingBufferHolder &holder) { + int nb_samples = 0; + while (reader_continue) { + ddprof::MPSCRingBufferReader reader(holder.get_ring_buffer()); + auto buf = reader.read_sample(); + if (!buf.empty()) { + ++nb_samples; + // fprintf(stderr, "Yep, got sample ! \n"); + } + std::this_thread::sleep_for(std::chrono::microseconds(10000)); + } + fprintf(stderr, "Reader thread exit, nb_samples=%d\n", nb_samples); +} DDPROF_NOINLINE void my_malloc(size_t size, uintptr_t addr = 0xdeadbeef) { - ddprof::AllocationTracker::track_allocation(addr, size); - // prevent tail call optimization - DDPROF_BLOCK_TAIL_CALL_OPTIMIZATION(); + ddprof::TrackerThreadLocalState *tl_state = + ddprof::AllocationTracker::get_tl_state(); + if (tl_state) { // tl_state is null if we are not tracking allocations + ddprof::AllocationTracker::track_allocation_s(addr, size, *tl_state); + DDPROF_BLOCK_TAIL_CALL_OPTIMIZATION(); + } } DDPROF_NOINLINE void my_free(uintptr_t addr) { - ddprof::AllocationTracker::track_deallocation(addr); - // prevent tail call optimization + ddprof::AllocationTracker::track_deallocation_s(addr); DDPROF_BLOCK_TAIL_CALL_OPTIMIZATION(); } + // Function to perform allocations and deallocations void perform_memory_operations(bool track_allocations, benchmark::State &state) { LogHandle handle; - const uint64_t rate = 1; + const uint64_t rate = k_rate; const size_t buf_size_order = 8; +#ifndef LIVE_HEAP uint32_t flags = ddprof::AllocationTracker::kDeterministicSampling; - // uint32_t flags = ddprof::AllocationTracker::kDeterministicSampling | - // ddprof::AllocationTracker::kTrackDeallocations; +#else + uint32_t flags = ddprof::AllocationTracker::kDeterministicSampling | + ddprof::AllocationTracker::kTrackDeallocations; +#endif ddprof::RingBufferHolder ring_buffer{buf_size_order, RingBufferType::kMPSCRingBuffer}; @@ -43,9 +78,13 @@ void perform_memory_operations(bool track_allocations, std::random_device rd; std::mt19937 gen(rd()); - for (auto _ : state) { - // state.PauseTiming(); +#ifdef READER_THREAD + // create reader worker thread + reader_continue = true; + std::thread reader_thread{read_buffer, std::ref(ring_buffer)}; +#endif + for (auto _ : state) { // Initialize threads and clear addresses threads.clear(); std::vector<std::vector<uintptr_t>> thread_addresses(nb_threads); @@ -67,8 +106,6 @@ void perform_memory_operations(bool track_allocations, t.join(); } - // state.ResumeTiming(); - threads.clear(); for (int i = 0; i < nb_threads; ++i) { threads.emplace_back([&, i] { @@ -82,18 +119,157 @@ void perform_memory_operations(bool track_allocations, t.join(); } } +#ifdef READER_THREAD + reader_continue = false; + reader_thread.join(); +#endif ddprof::AllocationTracker::allocation_tracking_free(); } // Benchmark without allocation tracking -static void BM_ThreadedAllocations_NoTracking(benchmark::State &state) { +static void BM_ShortLived_NoTracking(benchmark::State &state) { perform_memory_operations(false, state); } // Benchmark with allocation tracking -static void BM_ThreadedAllocations_Tracking(benchmark::State &state) { +static void BM_ShortLived_Tracking(benchmark::State &state) { perform_memory_operations(true, state);
} -BENCHMARK(BM_ThreadedAllocations_NoTracking); -BENCHMARK(BM_ThreadedAllocations_Tracking); +class WorkerThread { +public: + std::vector<uintptr_t> addresses; + bool allocate; + + WorkerThread() : stop(false), perform_task(false) { + worker_thread = std::thread([this] { + while (!stop) { + std::unique_lock lock(mutex); + cv.wait(lock, [this] { return perform_task || stop; }); + if (stop) + return; + + // Perform the task that was signaled for this thread + if (allocate) { + for (auto addr : addresses) { + my_malloc(1024, addr); + } + } else { + for (auto addr : addresses) { + my_free(addr); + } + } + + perform_task = false; + } + }); + } + + void signal_task(bool allocate_task, const std::vector<uintptr_t> &addrs) { + { + std::lock_guard lock(mutex); + addresses = addrs; + allocate = allocate_task; + perform_task = true; + } + cv.notify_one(); + } + + ~WorkerThread() { + stop = true; + cv.notify_one(); + if (worker_thread.joinable()) { + worker_thread.join(); + } + } + +private: + std::thread worker_thread; + std::condition_variable cv; + std::mutex mutex; + std::atomic<bool> stop; + std::atomic<bool> perform_task; +}; + +void perform_memory_operations_2(bool track_allocations, + benchmark::State &state) { + LogHandle handle; + const uint64_t rate = k_rate; + const size_t buf_size_order = 8; +#ifndef LIVE_HEAP + uint32_t flags = ddprof::AllocationTracker::kDeterministicSampling; +#else + uint32_t flags = ddprof::AllocationTracker::kDeterministicSampling | + ddprof::AllocationTracker::kTrackDeallocations; +#endif + ddprof::RingBufferHolder ring_buffer{buf_size_order, + RingBufferType::kMPSCRingBuffer}; + + if (track_allocations) { + ddprof::AllocationTracker::allocation_tracking_init( + rate, flags, k_default_perf_stack_sample_size, + ring_buffer.get_buffer_info()); + } + +#ifdef READER_THREAD + reader_continue = true; + std::thread reader_thread{read_buffer, std::ref(ring_buffer)}; +#endif + const int nb_threads = 4; + std::vector<WorkerThread> workers(nb_threads); + std::vector<std::vector<uintptr_t>> thread_addresses(nb_threads); + + int num_allocations = 1000; + size_t page_size = 0x1000; + std::random_device rd; + std::mt19937 gen(rd()); + + for (int i = 0; i < nb_threads; ++i) { + std::uniform_int_distribution<> dis(i * page_size, (i + 1) * page_size - 1); + for (int j = 0; j < num_allocations; ++j) { + uintptr_t addr = dis(gen); + thread_addresses[i].push_back(addr); + } + } + + for (auto _ : state) { + // Allocation phase + for (int i = 0; i < nb_threads; ++i) { + workers[i].signal_task(true, thread_addresses[i]); + } + // Add delay + std::this_thread::sleep_for(std::chrono::microseconds(100)); + + // Deallocation phase + for (int i = 0; i < nb_threads; ++i) { + workers[i].signal_task(false, thread_addresses[i]); + } + // Add delay + std::this_thread::sleep_for(std::chrono::microseconds(100)); + } + +#ifdef READER_THREAD + reader_continue = false; + reader_thread.join(); +#endif + + ddprof::AllocationTracker::allocation_tracking_free(); +} + +// Benchmark without allocation tracking +static void BM_LongLived_NoTracking(benchmark::State &state) { + perform_memory_operations_2(false, state); +} + +// Benchmark with allocation tracking +static void BM_LongLived_Tracking(benchmark::State &state) { + perform_memory_operations_2(true, state); +} + +// short-lived threads +BENCHMARK(BM_ShortLived_NoTracking)->MeasureProcessCPUTime()->UseRealTime(); +BENCHMARK(BM_ShortLived_Tracking)->MeasureProcessCPUTime()->UseRealTime(); + +// longer-lived threads (worker threads) +BENCHMARK(BM_LongLived_NoTracking)->MeasureProcessCPUTime();
+BENCHMARK(BM_LongLived_Tracking)->MeasureProcessCPUTime(); diff --git a/test/allocation_tracker-ut.cc b/test/allocation_tracker-ut.cc index e6ad6e279..6bca2f83e 100644 --- a/test/allocation_tracker-ut.cc +++ b/test/allocation_tracker-ut.cc @@ -18,13 +18,19 @@ #include DDPROF_NOINLINE void my_malloc(size_t size, uintptr_t addr = 0xdeadbeef) { - ddprof::AllocationTracker::track_allocation(addr, size); + ddprof::TrackerThreadLocalState *tl_state = + ddprof::AllocationTracker::get_tl_state(); + ddprof::ReentryGuard guard(tl_state ? &(tl_state->double_tracking_guard) + : nullptr); + if (guard) { + ddprof::AllocationTracker::track_allocation_s(addr, size, *tl_state); + } // prevent tail call optimization getpid(); } DDPROF_NOINLINE void my_free(uintptr_t addr) { - ddprof::AllocationTracker::track_deallocation(addr); + ddprof::AllocationTracker::track_deallocation_s(addr); // prevent tail call optimization getpid(); } @@ -120,7 +126,12 @@ TEST(allocation_tracker, stale_lock) { for (uint32_t i = 0; i < ddprof::AllocationTracker::k_max_consecutive_failures; ++i) { - ddprof::AllocationTracker::track_allocation(0xdeadbeef, 1); + ddprof::TrackerThreadLocalState *tl_state = + ddprof::AllocationTracker::get_tl_state(); + assert(tl_state); + if (tl_state) { + ddprof::AllocationTracker::track_allocation_s(0xdeadbeef, 1, *tl_state); + } } ASSERT_FALSE(ddprof::AllocationTracker::is_active()); ddprof::AllocationTracker::allocation_tracking_free(); diff --git a/test/check_no_tls-ut.sh b/test/check_no_tls-ut.sh new file mode 100755 index 000000000..31b03a377 --- /dev/null +++ b/test/check_no_tls-ut.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash + +set -euo pipefail + +LD_PRELOAD=./test/no_tls/libno_tls.so ./ddprof sleep 1 diff --git a/test/glibc_fixes-ut.cc b/test/glibc_fixes-ut.cc new file mode 100644 index 000000000..4f19a04cf --- /dev/null +++ b/test/glibc_fixes-ut.cc @@ -0,0 +1,44 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. This product includes software +// developed at Datadog (https://www.datadoghq.com/). Copyright 2021-Present +// Datadog, Inc. 
+ +#include +#include +#include +#include + +// assuming this variable is accessible from your test file +static std::atomic g_child_called; +static std::atomic g_parent_called; +static std::atomic g_prepare_called; + +void prepare() { g_prepare_called = true; } + +void parent() { g_parent_called = true; } + +void child() { g_child_called = true; } + +TEST(GlibcFixes, pthread_atfork) { + g_child_called = false; + g_parent_called = false; + g_prepare_called = false; + + // register handlers + EXPECT_EQ(0, pthread_atfork(prepare, parent, child)); + + pid_t child_pid = fork(); + + if (child_pid == 0) { + // This is the child process + EXPECT_TRUE(g_child_called); + exit(0); + } else { + + // validate that the handlers were called + EXPECT_TRUE(g_prepare_called); + EXPECT_TRUE(g_parent_called); + + waitpid(child_pid, nullptr, 0); // wait for child to finish + } +} diff --git a/test/no_tls/CMakeLists.txt b/test/no_tls/CMakeLists.txt new file mode 100644 index 000000000..827309f18 --- /dev/null +++ b/test/no_tls/CMakeLists.txt @@ -0,0 +1 @@ +add_library(no_tls_lib SHARED no_tls_lib.c) diff --git a/test/no_tls/no_tls_lib.c b/test/no_tls/no_tls_lib.c new file mode 100644 index 000000000..64bdf26b2 --- /dev/null +++ b/test/no_tls/no_tls_lib.c @@ -0,0 +1,7 @@ +#include +#include + +void *__tls_get_addr(void *v) { + fprintf(stderr, "__tls_get_addr was called, which is not allowed!\n"); + abort(); +} diff --git a/test/no_tls/no_tls_lib.h b/test/no_tls/no_tls_lib.h new file mode 100644 index 000000000..42075bf73 --- /dev/null +++ b/test/no_tls/no_tls_lib.h @@ -0,0 +1,3 @@ +#pragma once + +void *__tls_get_addr(void *v); \ No newline at end of file diff --git a/test/pthread_tls-ut.cc b/test/pthread_tls-ut.cc new file mode 100644 index 000000000..db0acc19b --- /dev/null +++ b/test/pthread_tls-ut.cc @@ -0,0 +1,86 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. This product includes software +// developed at Datadog (https://www.datadoghq.com/). Copyright 2021-Present +// Datadog, Inc. 
+ +#include + +#include +#include + +constexpr int num_threads = 10; +static pthread_key_t key; +// +1 for main thread +static std::array is_set = {}; + +void *set_get_key(void *threadid) { + long tid = *((long *)(threadid)); + void *ret_val = pthread_getspecific(key); + EXPECT_EQ(ret_val, nullptr); + pthread_setspecific(key, (void *)threadid); + ret_val = pthread_getspecific(key); + EXPECT_EQ(*(reinterpret_cast(ret_val)), tid); + is_set[tid] = true; + // Check that set does not clear + ret_val = pthread_getspecific(key); + EXPECT_EQ(*(reinterpret_cast(ret_val)), tid); + return nullptr; +} +static std::array thread_ids; + +TEST(PThreadTest, SetGetSpecific) { + pthread_key_create(&key, nullptr); + pthread_t threads[num_threads]; + + // Create threads + + for (int i = 0; i < num_threads; ++i) { + thread_ids[i] = i; + pthread_create(&threads[i], nullptr, set_get_key, &(thread_ids[i])); + } + + // Join threads + for (auto &thread : threads) { + pthread_join(thread, nullptr); + } + // check behaviour for main thread + static long main_thread_id = num_threads; + set_get_key(&main_thread_id); + + // Check if set for all threads + for (bool val : is_set) { + EXPECT_TRUE(val); + } + + if (fork() == 0) { + // Child process + // Reset the set values + for (bool &val : is_set) { + val = false; + } + + // Create and join threads again + for (long i = 0; i < num_threads; ++i) { + pthread_create(&threads[i], nullptr, set_get_key, &(thread_ids[i])); + } + for (auto &thread : threads) { + pthread_join(thread, nullptr); + } + + // Expect main thread to be set already + void *ret_val = pthread_getspecific(key); + EXPECT_EQ(*(reinterpret_cast(ret_val)), main_thread_id); + is_set[main_thread_id] = true; + + // Check if set for all threads + for (bool val : is_set) { + EXPECT_TRUE(val); + } + _exit(0); + } else { + // Parent process + // Wait for the child process to finish + wait(nullptr); + } + pthread_key_delete(key); +} diff --git a/test/reentry_guard-ut.cc b/test/reentry_guard-ut.cc new file mode 100644 index 000000000..4b733581e --- /dev/null +++ b/test/reentry_guard-ut.cc @@ -0,0 +1,101 @@ +// Unless explicitly stated otherwise all files in this repository are licensed +// under the Apache License Version 2.0. This product includes software +// developed at Datadog (https://www.datadoghq.com/). Copyright 2021-Present +// Datadog, Inc. 
+ +#include + +#include "lib/reentry_guard.hpp" +#include "syscalls.hpp" + +TEST(ReentryGuardTest, basic) { + bool reentry_guard = false; + { + ddprof::ReentryGuard guard(&reentry_guard); + EXPECT_TRUE(static_cast<bool>(guard)); + EXPECT_TRUE(reentry_guard); + } + EXPECT_FALSE(reentry_guard); +} + +TEST(ReentryGuardTest, null_init) { + bool reentry_guard = false; + { + ddprof::ReentryGuard guard(nullptr); + EXPECT_FALSE(guard); + guard.register_guard(&reentry_guard); + EXPECT_TRUE(guard); + { + ddprof::ReentryGuard guard(&reentry_guard); + EXPECT_FALSE(guard); + } + } +} + +TEST(TLReentryGuardTest, basic) { + ddprof::ThreadEntries entries; + pid_t tid = ddprof::gettid(); + + { + ddprof::TLReentryGuard guard(entries, tid); + EXPECT_TRUE(static_cast<bool>(guard)); + // Reenter a second time + ddprof::TLReentryGuard guard2(entries, tid); + EXPECT_FALSE(static_cast<bool>(guard2)); + } +} + +TEST(TLReentryGuardTest, many_threads) { + ddprof::ThreadEntries entries; + std::vector<std::thread> threads; + for (int i = 0; i < 100; ++i) { + threads.emplace_back([&]() { + pid_t tid = ddprof::gettid(); + ddprof::TLReentryGuard guard(entries, tid); + EXPECT_TRUE(static_cast<bool>(guard)); + // Sleep to simulate work + std::this_thread::sleep_for(std::chrono::milliseconds(5)); + }); + } + // Join all threads + for (auto &thread : threads) { + thread.join(); + } + // Check that all entries are reset + for (auto &entry : entries.thread_entries) { + EXPECT_EQ(entry.load(), -1); + } +} + +TEST(TLReentryGuardTest, reacquisition_many_threads) { + ddprof::ThreadEntries entries; + std::vector<std::thread> threads; + for (int i = 0; i < 100; ++i) { + threads.emplace_back([&]() { + pid_t tid = ddprof::gettid(); + + // First acquisition + { + ddprof::TLReentryGuard guard(entries, tid); + EXPECT_TRUE(static_cast<bool>(guard)); + // Sleep to simulate work + std::this_thread::sleep_for(std::chrono::milliseconds(5)); + } + // Re-acquisition + { + ddprof::TLReentryGuard guard(entries, tid); + EXPECT_TRUE(static_cast<bool>(guard)); + } + }); + } + + // Join all threads + for (auto &thread : threads) { + thread.join(); + } + + // Check that all entries are reset + for (auto &entry : entries.thread_entries) { + EXPECT_EQ(entry.load(), -1); + } +} diff --git a/test/simple_malloc-ut.sh b/test/simple_malloc-ut.sh index 5502b73cd..4d45c68b5 100755 --- a/test/simple_malloc-ut.sh +++ b/test/simple_malloc-ut.sh @@ -21,6 +21,7 @@ test_cpu_mask=$(python3 -c 'import random,os;print(hex(1 << random.choice(list(o opts="--loop 1000 --spin 100" log_file=$(mktemp "${PWD}/log.XXXXXX") +# echo "logs available here $log_file" rm "${log_file}" export DD_PROFILING_NATIVE_LOG_MODE="${log_file}" @@ -36,6 +37,7 @@ check() { expected_pids="$2" expected_tids="${3-$2}" # shellcheck disable=SC2086 + # echo "running :${cmd}" taskset "${test_cpu_mask}" ${cmd} if [[ "${expected_pids}" -eq 1 ]]; then # Ugly workaround for tail bug that makes it wait indefinitely for new lines when `grep -q` exists:
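[Closing note 1] The `__pthread_atfork` fallback in glibc_fixes.c resolves the real `pthread_atfork` at first use. Sketched in isolation below; the wrapper name and retry policy are illustrative, and the actual patch additionally rejects a lookup that resolves back to `__pthread_atfork` itself to avoid infinite recursion:

  #define _GNU_SOURCE // for RTLD_NEXT / RTLD_DEFAULT
  #include <dlfcn.h>

  using atfork_fn = int (*)(void (*)(void), void (*)(void), void (*)(void));

  int forward_pthread_atfork(void (*prepare)(void), void (*parent)(void),
                             void (*child)(void)) {
    // RTLD_NEXT walks to the next definition in search order (the musl case);
    // under LD_PRELOAD the symbol may only be visible via RTLD_DEFAULT.
    static atfork_fn real =
        reinterpret_cast<atfork_fn>(dlsym(RTLD_NEXT, "pthread_atfork"));
    if (!real) {
      real = reinterpret_cast<atfork_fn>(dlsym(RTLD_DEFAULT, "pthread_atfork"));
    }
    return real ? real(prepare, parent, child) : -1;
  }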
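[Closing note 2] Every interposed allocation entry point in symbol_overrides.cc now follows the same shape, reduced here to its core; my_malloc_hook and real_malloc are placeholder names, while the ddprof calls are the ones introduced by this diff:

  #include <cstddef>
  #include <cstdint>
  #include "allocation_tracker.hpp" // AllocationTracker, ReentryGuard

  extern void *real_malloc(size_t size); // placeholder for the real allocator

  void *my_malloc_hook(size_t size) noexcept {
    ddprof::TrackerThreadLocalState *tl_state =
        ddprof::AllocationTracker::get_tl_state();
    // A null tl_state produces a disengaged guard: the allocation is still
    // forwarded, just not tracked.
    ddprof::ReentryGuard guard(tl_state ? &tl_state->double_tracking_guard
                                        : nullptr);
    void *ptr = real_malloc(size); // an mmap inside malloc sees the guard taken
    if (guard) {
      ddprof::AllocationTracker::track_allocation_s(
          reinterpret_cast<uintptr_t>(ptr), size, *tl_state);
    }
    return ptr;
  }

The deallocation hooks skip the guard and call track_deallocation_s directly, since it fetches the thread-local state itself and bails out when none exists.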