From d1421ac9d49fbb97bdd58635d7e6828dd0843b2b Mon Sep 17 00:00:00 2001 From: Christopher Jones Date: Fri, 25 Aug 2023 11:06:57 -0500 Subject: [PATCH] Added AllocMonitor facility A general system to watch allocations/deallocations --- PerfTools/AllocMonitor/BuildFile.xml | 4 + PerfTools/AllocMonitor/README.md | 76 ++++ .../AllocMonitor/interface/AllocMonitorBase.h | 50 +++ .../interface/AllocMonitorRegistry.h | 140 ++++++++ PerfTools/AllocMonitor/plugins/BuildFile.xml | 3 + .../plugins/EventProcessingAllocMonitor.cc | 98 +++++ .../plugins/HistogrammingAllocMonitor.cc | 87 +++++ .../plugins/SimpleAllocMonitor.cc | 83 +++++ .../AllocMonitor/src/AllocMonitorBase.cc | 38 ++ .../AllocMonitor/src/AllocMonitorRegistry.cc | 108 ++++++ PerfTools/AllocMonitor/test/BuildFile.xml | 10 + .../test/test_catch2_AllocMonitorRegistry.cc | 177 +++++++++ PerfTools/AllocMonitor/test/test_proxies.cc | 199 +++++++++++ PerfTools/AllocMonitorPreload/BuildFile.xml | 6 + PerfTools/AllocMonitorPreload/README.md | 30 ++ .../AllocMonitorPreload/src/memory_proxies.cc | 338 ++++++++++++++++++ PerfTools/MaxMemoryPreload/BuildFile.xml | 6 + PerfTools/MaxMemoryPreload/README.md | 29 ++ PerfTools/MaxMemoryPreload/src/preload.cc | 81 +++++ 19 files changed, 1563 insertions(+) create mode 100644 PerfTools/AllocMonitor/BuildFile.xml create mode 100644 PerfTools/AllocMonitor/README.md create mode 100644 PerfTools/AllocMonitor/interface/AllocMonitorBase.h create mode 100644 PerfTools/AllocMonitor/interface/AllocMonitorRegistry.h create mode 100644 PerfTools/AllocMonitor/plugins/BuildFile.xml create mode 100644 PerfTools/AllocMonitor/plugins/EventProcessingAllocMonitor.cc create mode 100644 PerfTools/AllocMonitor/plugins/HistogrammingAllocMonitor.cc create mode 100644 PerfTools/AllocMonitor/plugins/SimpleAllocMonitor.cc create mode 100644 PerfTools/AllocMonitor/src/AllocMonitorBase.cc create mode 100644 PerfTools/AllocMonitor/src/AllocMonitorRegistry.cc create mode 100644 
PerfTools/AllocMonitor/test/BuildFile.xml create mode 100644 PerfTools/AllocMonitor/test/test_catch2_AllocMonitorRegistry.cc create mode 100644 PerfTools/AllocMonitor/test/test_proxies.cc create mode 100644 PerfTools/AllocMonitorPreload/BuildFile.xml create mode 100644 PerfTools/AllocMonitorPreload/README.md create mode 100644 PerfTools/AllocMonitorPreload/src/memory_proxies.cc create mode 100644 PerfTools/MaxMemoryPreload/BuildFile.xml create mode 100644 PerfTools/MaxMemoryPreload/README.md create mode 100644 PerfTools/MaxMemoryPreload/src/preload.cc diff --git a/PerfTools/AllocMonitor/BuildFile.xml b/PerfTools/AllocMonitor/BuildFile.xml new file mode 100644 index 0000000000000..37051249a2261 --- /dev/null +++ b/PerfTools/AllocMonitor/BuildFile.xml @@ -0,0 +1,4 @@ + + + + diff --git a/PerfTools/AllocMonitor/README.md b/PerfTools/AllocMonitor/README.md new file mode 100644 index 0000000000000..a0552b6515958 --- /dev/null +++ b/PerfTools/AllocMonitor/README.md @@ -0,0 +1,76 @@ +# PerfTools/AllocMonitor Description + +## Introduction + +This package works with the PerfTools/AllocMonitorPreload package to provide a general facility to watch allocations and deallocations. +This is accomplished by using LD_PRELOAD with libPerfToolsAllocMonitorPreload.so and registering a class inheriting from `AllocMonitorBase` +with `AllocMonitorRegistry`. The preloaded library puts in proxies for the C and C++ allocation methods (and forwards the calls to the +original job methods). These proxies communicate with `AllocMonitorRegistry` which, in turn, calls methods of the registered monitors. + +## Extending + +To add a new monitor, one inherits from `cms::perftools::AllocMonitorBase` and overrides the `allocCalled` and +`deallocCalled` methods. + +- `AllocMonitorBase::allocCalled(size_t iRequestedSize, size_t iActualSize)` : `iRequestedSize` is the number of bytes being requested by the allocation call. `iActualSize` is the actual number of bytes returned by the allocator. 
These can be different because of alignment constraints (e.g. asking for 1 byte but all allocations must be aligned on a particular memory boundary) or internal details of the allocator. + +- `AllocMonitorBase::deallocCalled(size_t iActualSize)` : `iActualSize` is the actual size returned when the associated allocation was made. NOTE: the glibc extended interface does not provide a way to find the requested size based on the address returned from an allocation, it only provides the actual size. + +When implementing `allocCalled` and `deallocCalled` it is perfectly fine to do allocations/deallocations. The facility +guarantees that those internal allocations will not cause any callbacks to be sent to any active monitors. + + +To add a monitor to the facility, one must access the registry by calling the static method +`cms::perftools::AllocMonitorRegistry::instance()` and then call the member function +`T* createAndRegisterMonitor(ARGS&&... iArgs)`. The function will internally create a monitor of type `T` (being careful +to not cause callbacks during the allocation) and pass the arguments `iArgs` to the constructor. + +The monitor is owned by the registry and should not be deleted by any other code. If one needs to control the lifetime +of the monitor, one can call `cms::perftools::AllocMonitorRegistry::deregisterMonitor` to have the monitor removed from +the callback list and be deleted (again, without the deallocation causing any callbacks). + +## General usage + +To use the facility, one needs to use LD_PRELOAD to load in the memory proxies before the application runs, e.g. +``` +LD_PRELOAD=libPerfToolsAllocMonitorPreload.so cmsRun some_config_cfg.py +``` + +Internally, the program needs to register a monitor with the facility. When using `cmsRun` this can most easily be done +by loading a Service which sets up a monitor. If one fails to do the LD_PRELOAD, then when the monitor is registered, the +facility will throw an exception. 
+ +It is also possible to use LD_PRELOAD to load another library which auto registers a monitor even before the program +begins. See PerfTools/MaxMemoryPreload for an example. + +## Services + +### SimpleAllocMonitor +This service registers a monitor when the service is created (after python parsing is finished but before any modules +have been loaded into cmsRun) and reports its accumulated information when the service is destroyed (services are the +last plugins to be destroyed by cmsRun). The monitor reports +- Total amount of bytes requested by all allocation calls +- The maximum amount of _used_ (i.e actual size) allocated memory that was in use by the job at one time. +- Number of calls made to allocation functions while the monitor was running. +- Number of calls made to deallocation functions while the monitor was running. +This service is multi-thread safe. Note that when run multi-threaded the maximum reported value will vary from job to job. + + +### EventProcessingAllocMonitor +This service registers a monitor at the end of beginJob (after all modules have been loaded and setup) and reports its accumulated information at the beginning of endJob (after the event loop has finished but before any cleanup is done). This can be useful in understanding how memory is being used during the event loop. The monitor reports +- Total amount of bytes requested by all allocation calls during the event loop +- The maximum amount of _used_ (i.e. actual size) allocated memory that was in use in the event loop at one time. +- The amount of _used_ memory allocated during the loop that has yet to be reclaimed by calling deallocation. +- Number of calls made to allocation functions during the event loop. +- Number of calls made to deallocation functions during the event loop. +This service is multi-thread safe. Note that when run multi-threaded the maximum reported value will vary from job to job. 
+ +### HistogrammingAllocMonitor +This service registers a monitor when the service is created (after python parsing is finished but before any modules +have been loaded into cmsRun) and reports its accumulated information when the service is destroyed (services are the +last plugins to be destroyed by cmsRun). The monitor histograms the values into bins of number of bytes where each +bin is a power of 2 larger than the previous. The histograms made are +- Amount of bytes requested by all allocation calls +- Amount of bytes actually used by all allocation calls +- Amount of bytes actually returned by all deallocation calls +This service is multi-thread safe. Note that when run multi-threaded the maximum reported value will vary from job to job. diff --git a/PerfTools/AllocMonitor/interface/AllocMonitorBase.h b/PerfTools/AllocMonitor/interface/AllocMonitorBase.h new file mode 100644 index 0000000000000..2a5000fc81b6a --- /dev/null +++ b/PerfTools/AllocMonitor/interface/AllocMonitorBase.h @@ -0,0 +1,50 @@ +#ifndef AllocMonitor_interface_AllocMonitorBase_h +#define AllocMonitor_interface_AllocMonitorBase_h +// -*- C++ -*- +// +// Package: AllocMonitor/interface +// Class : AllocMonitorBase +// +/**\class AllocMonitorBase AllocMonitorBase.h "AllocMonitorBase.h" + + Description: Base class for extensions that monitor allocations + + Usage: + The class is required to be thread safe as all member functions + will be called concurrently when used in a multi-threaded program. + + If allocations are done within the methods, no callbacks will be + generated as the underlying system will temporarily suspend such + calls on the thread running the method. 
+ +*/ +// +// Original Author: Christopher Jones +// Created: Mon, 21 Aug 2023 14:03:34 GMT +// + +// system include files +#include //size_t + +// user include files + +// forward declarations + +namespace cms::perftools { + + class AllocMonitorBase { + public: + AllocMonitorBase(); + virtual ~AllocMonitorBase(); + + AllocMonitorBase(const AllocMonitorBase&) = delete; // stop default + AllocMonitorBase(AllocMonitorBase&&) = delete; // stop default + AllocMonitorBase& operator=(const AllocMonitorBase&) = delete; // stop default + AllocMonitorBase& operator=(AllocMonitorBase&&) = delete; // stop default + + // ---------- member functions --------------------------- + virtual void allocCalled(size_t iRequestedSize, size_t iActualSize) = 0; + virtual void deallocCalled(size_t iActualSize) = 0; + }; +} // namespace cms::perftools +#endif diff --git a/PerfTools/AllocMonitor/interface/AllocMonitorRegistry.h b/PerfTools/AllocMonitor/interface/AllocMonitorRegistry.h new file mode 100644 index 0000000000000..e2cef0354c6f4 --- /dev/null +++ b/PerfTools/AllocMonitor/interface/AllocMonitorRegistry.h @@ -0,0 +1,140 @@ +#ifndef PerfTools_AllocMonitor_AllocMonitorRegistry_h +#define PerfTools_AllocMonitor_AllocMonitorRegistry_h +// -*- C++ -*- +// +// Package: PerfTools/AllocMonitor +// Class : AllocMonitorRegistry +// +/**\class AllocMonitorRegistry AllocMonitorRegistry.h "AllocMonitorRegistry.h" + + Description: [one line class summary] + + Usage: + + +*/ +// +// Original Author: Christopher Jones +// Created: Mon, 21 Aug 2023 14:12:54 GMT +// + +// system include files +#include +#include +#include +#include + +// user include files +#include "AllocMonitorBase.h" + +// forward declarations + +namespace cms::perftools { + class AllocTester; + + class AllocMonitorRegistry { + public: + ~AllocMonitorRegistry(); + + AllocMonitorRegistry(AllocMonitorRegistry&&) = delete; // stop default + AllocMonitorRegistry(const AllocMonitorRegistry&) = delete; // stop default + 
AllocMonitorRegistry& operator=(const AllocMonitorRegistry&) = delete; // stop default + AllocMonitorRegistry& operator=(AllocMonitorRegistry&&) = delete; // stop default + + // ---------- static member functions -------------------- + static AllocMonitorRegistry& instance(); + + // ---------- member functions --------------------------- + template + T* createAndRegisterMonitor(ARGS&&... iArgs); + void deregisterMonitor(AllocMonitorBase*); + + private: + friend void* ::malloc(size_t) noexcept; + friend void* ::calloc(size_t, size_t) noexcept; + friend void* ::realloc(void*, size_t) noexcept; + friend void* ::aligned_alloc(size_t, size_t) noexcept; + friend void ::free(void*) noexcept; + + friend void* ::operator new(std::size_t size); + friend void* ::operator new[](std::size_t size); + friend void* ::operator new(std::size_t count, std::align_val_t al); + friend void* ::operator new[](std::size_t count, std::align_val_t al); + friend void* ::operator new(std::size_t count, const std::nothrow_t& tag) noexcept; + friend void* ::operator new[](std::size_t count, const std::nothrow_t& tag) noexcept; + friend void* ::operator new(std::size_t count, std::align_val_t al, const std::nothrow_t&) noexcept; + friend void* ::operator new[](std::size_t count, std::align_val_t al, const std::nothrow_t&) noexcept; + + friend void ::operator delete(void* ptr) noexcept; + friend void ::operator delete[](void* ptr) noexcept; + friend void ::operator delete(void* ptr, std::align_val_t al) noexcept; + friend void ::operator delete[](void* ptr, std::align_val_t al) noexcept; + friend void ::operator delete(void* ptr, std::size_t sz) noexcept; + friend void ::operator delete[](void* ptr, std::size_t sz) noexcept; + friend void ::operator delete(void* ptr, std::size_t sz, std::align_val_t al) noexcept; + friend void ::operator delete[](void* ptr, std::size_t sz, std::align_val_t al) noexcept; + friend void ::operator delete(void* ptr, const std::nothrow_t& tag) noexcept; + friend void 
::operator delete[](void* ptr, const std::nothrow_t& tag) noexcept; + friend void ::operator delete(void* ptr, std::align_val_t al, const std::nothrow_t& tag) noexcept; + friend void ::operator delete[](void* ptr, std::align_val_t al, const std::nothrow_t& tag) noexcept; + + friend class AllocTester; + + // ---------- member data -------------------------------- + void start(); + bool& isRunning(); + + struct Guard { + explicit Guard(bool& iOriginal) noexcept : address_(&iOriginal), original_(iOriginal) { *address_ = false; } + ~Guard() { *address_ = original_; } + + bool running() const noexcept { return original_; } + + Guard(Guard const&) = delete; + Guard(Guard&&) = delete; + Guard& operator=(Guard const&) = delete; + Guard& operator=(Guard&&) = delete; + + bool* address_; + bool original_; + }; + + Guard makeGuard() { return Guard(isRunning()); } + + void allocCalled_(size_t, size_t); + void deallocCalled_(size_t); + + template + auto allocCalled(size_t iRequested, ALLOC iAlloc, ACT iGetActual) { + [[maybe_unused]] Guard g = makeGuard(); + auto a = iAlloc(); + if (g.running()) { + allocCalled_(iRequested, iGetActual(a)); + } + return a; + } + template + void deallocCalled(DEALLOC iDealloc, ACT iGetActual) { + [[maybe_unused]] Guard g = makeGuard(); + if (g.running()) { + deallocCalled_(iGetActual()); + } + iDealloc(); + } + + AllocMonitorRegistry(); + std::vector> monitors_; + }; + + template + T* AllocMonitorRegistry::createAndRegisterMonitor(ARGS&&... 
iArgs) {
+    [[maybe_unused]] Guard guard = makeGuard();
+    start();
+
+    auto m = std::make_unique<T>(std::forward<ARGS>(iArgs)...);
+    auto p = m.get();
+    monitors_.push_back(std::move(m));
+    return p;
+  }
+}  // namespace cms::perftools
+#endif
diff --git a/PerfTools/AllocMonitor/plugins/BuildFile.xml b/PerfTools/AllocMonitor/plugins/BuildFile.xml
new file mode 100644
index 0000000000000..f7306fd1ae1f7
--- /dev/null
+++ b/PerfTools/AllocMonitor/plugins/BuildFile.xml
@@ -0,0 +1,3 @@
+
+
+
diff --git a/PerfTools/AllocMonitor/plugins/EventProcessingAllocMonitor.cc b/PerfTools/AllocMonitor/plugins/EventProcessingAllocMonitor.cc
new file mode 100644
index 0000000000000..f12565893cb25
--- /dev/null
+++ b/PerfTools/AllocMonitor/plugins/EventProcessingAllocMonitor.cc
@@ -0,0 +1,112 @@
+// -*- C++ -*-
+//
+// Package: PerfTools/AllocMonitor
+// Class : EventProcessingAllocMonitor
+//
+// Implementation:
+// Service that monitors allocations between the end of beginJob and the start
+// of endJob and reports the accumulated totals through edm::LogSystem.
+//
+// Original Author: Christopher Jones
+// Created: Mon, 21 Aug 2023 20:31:57 GMT
+//
+
+// system include files
+#include <atomic>
+
+// user include files
+#include "PerfTools/AllocMonitor/interface/AllocMonitorBase.h"
+#include "PerfTools/AllocMonitor/interface/AllocMonitorRegistry.h"
+#include "FWCore/ServiceRegistry/interface/ServiceRegistry.h"
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+#include "FWCore/ServiceRegistry/interface/ServiceMaker.h"
+
+namespace {
+  // Accumulates allocation statistics while started_ is set. All state is atomic
+  // because the registry invokes the callbacks concurrently from every thread.
+  class MonitorAdaptor : public cms::perftools::AllocMonitorBase {
+  public:
+    // Stops the accounting and logs the totals gathered since start().
+    void performanceReport() {
+      started_.store(false, std::memory_order_release);
+
+      auto finalRequested = requested_.load(std::memory_order_acquire);
+      auto maxActual = maxActual_.load(std::memory_order_acquire);
+      auto present = presentActual_.load(std::memory_order_acquire);
+      auto allocs = nAllocations_.load(std::memory_order_acquire);
+      auto deallocs = nDeallocations_.load(std::memory_order_acquire);
+
+      edm::LogSystem("EventProcessingAllocMonitor")
+          << "Event Processing Memory Report"
+          << "\n total memory requested: " << finalRequested << "\n max memory used: " << maxActual
+          << "\n total memory not deallocated: " << present << "\n # allocations calls: " << allocs
+          << "\n # deallocations calls: " << deallocs;
+    }
+
+    // Enables the accounting; callbacks received earlier are ignored.
+    void start() { started_.store(true, std::memory_order_release); }
+
+  private:
+    // iRequested: bytes asked for by the caller; iActual: bytes the allocator handed out.
+    void allocCalled(size_t iRequested, size_t iActual) final {
+      if (not started_.load(std::memory_order_acquire)) {
+        return;
+      }
+      nAllocations_.fetch_add(1, std::memory_order_acq_rel);
+      requested_.fetch_add(iRequested, std::memory_order_acq_rel);
+
+      //returns previous value
+      auto a = presentActual_.fetch_add(iActual, std::memory_order_acq_rel);
+      a += iActual;
+
+      // Raise the recorded maximum; a failed exchange reloads max, so retry only while we still exceed it.
+      auto max = maxActual_.load(std::memory_order_relaxed);
+      while (a > max) {
+        if (maxActual_.compare_exchange_strong(max, a, std::memory_order_acq_rel)) {
+          break;
+        }
+      }
+    }
+    // iActual: size of the block being released, as reported by the allocator.
+    void deallocCalled(size_t iActual) final {
+      if (not started_.load(std::memory_order_acquire)) {
+        return;
+      }
+      nDeallocations_.fetch_add(1, std::memory_order_acq_rel);
+      // Memory allocated before start() can be freed afterwards, so only subtract
+      // what is still accounted for. The test and the subtraction must be a single
+      // atomic step: with a separate load and fetch_sub two threads could both pass
+      // the test and wrap the unsigned counter below zero.
+      auto present = presentActual_.load(std::memory_order_acquire);
+      while (present >= iActual) {
+        if (presentActual_.compare_exchange_weak(present, present - iActual, std::memory_order_acq_rel)) {
+          break;
+        }
+      }
+    }
+
+    std::atomic<size_t> requested_ = 0;
+    std::atomic<size_t> presentActual_ = 0;
+    std::atomic<size_t> maxActual_ = 0;
+    std::atomic<size_t> nAllocations_ = 0;
+    std::atomic<size_t> nDeallocations_ = 0;
+
+    std::atomic<bool> started_ = false;
+  };
+
+}  // namespace
+
+// Service that starts the monitor after beginJob and reports/removes it before endJob.
+class EventProcessingAllocMonitor {
+public:
+  EventProcessingAllocMonitor(edm::ParameterSet const& iPS, edm::ActivityRegistry& iAR) {
+    auto adaptor = cms::perftools::AllocMonitorRegistry::instance().createAndRegisterMonitor<MonitorAdaptor>();
+    iAR.postBeginJobSignal_.connect([adaptor]() { adaptor->start(); });
+    iAR.preEndJobSignal_.connect([adaptor]() {
+      adaptor->performanceReport();
+      cms::perftools::AllocMonitorRegistry::instance().deregisterMonitor(adaptor);
+    });
+  }
+};
+
+DEFINE_FWK_SERVICE(EventProcessingAllocMonitor);
diff --git 
a/PerfTools/AllocMonitor/plugins/HistogrammingAllocMonitor.cc b/PerfTools/AllocMonitor/plugins/HistogrammingAllocMonitor.cc
new file mode 100644
index 0000000000000..45cd32171e1c4
--- /dev/null
+++ b/PerfTools/AllocMonitor/plugins/HistogrammingAllocMonitor.cc
@@ -0,0 +1,104 @@
+// -*- C++ -*-
+//
+// Package: PerfTools/AllocMonitor
+// Class : HistogrammingAllocMonitor
+//
+// Implementation:
+// Service that histograms allocation and deallocation sizes into power-of-2
+// bins and reports the histograms through edm::LogSystem when destroyed.
+//
+// Original Author: Christopher Jones
+// Created: Mon, 21 Aug 2023 20:31:57 GMT
+//
+
+// system include files
+#include <array>
+#include <atomic>
+#include <iomanip>
+
+// user include files
+#include "PerfTools/AllocMonitor/interface/AllocMonitorBase.h"
+#include "PerfTools/AllocMonitor/interface/AllocMonitorRegistry.h"
+#include "FWCore/ServiceRegistry/interface/ServiceRegistry.h"
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+#include "FWCore/ServiceRegistry/interface/ServiceMaker.h"
+
+namespace {
+  // Counts calls per size bin. All counters are atomic because the registry
+  // invokes the callbacks concurrently from every thread.
+  class MonitorAdaptor : public cms::perftools::AllocMonitorBase {
+  public:
+    // iRequested: bytes asked for by the caller; iActual: bytes the allocator handed out.
+    void allocCalled(size_t iRequested, size_t iActual) final {
+      auto& a = allocRequested_[bin(iRequested)];
+      a.fetch_add(1, std::memory_order_acq_rel);
+
+      auto& u = allocUsed_[bin(iActual)];
+      u.fetch_add(1, std::memory_order_acq_rel);
+    }
+    // iActual: size of the block being released, as reported by the allocator.
+    void deallocCalled(size_t iActual) final {
+      auto& u = deallocUsed_[bin(iActual)];
+      u.fetch_add(1, std::memory_order_acq_rel);
+    }
+
+    // Logs one row per bin, labelled with the smallest size that falls into the bin.
+    void performanceReport() const {
+      auto log = edm::LogSystem("HistogrammingAllocMonitor");
+      log << "Memory Histogram"
+          << "\n size allocated deallocated"
+          << "\n requested used used";
+      size_t size = 0;
+      for (unsigned int i = 0; i < allocRequested_.size(); ++i) {
+        log << "\n"
+            << std::setw(12) << size << " " << std::setw(12) << allocRequested_[i] << " " << std::setw(12)
+            << allocUsed_[i] << " " << std::setw(12) << deallocUsed_[i];
+        if (size == 0) {
+          size = 1;
+        } else {
+          size *= 2;
+        }
+      }
+    }
+
+  private:
+    static constexpr size_t kNBins = 40;
+
+    // Returns the histogram bin for iValue: 0 for 0, otherwise the number of
+    // significant bits (1 -> 1, 2..3 -> 2, 4..7 -> 3, ...). A size_t can have more
+    // significant bits than there are bins, so larger values are folded into the
+    // last bin rather than indexing past the end of the arrays.
+    static size_t bin(size_t iValue) {
+      size_t i = 0;
+
+      while (iValue != 0 and i < kNBins - 1) {
+        ++i;
+        iValue /= 2;
+      }
+      return i;
+    }
+
+    // Value-initialized so every counter starts at zero; a default-initialized
+    // std::atomic is not guaranteed to be zero before C++20.
+    std::array<std::atomic<size_t>, kNBins> allocRequested_{};
+    std::array<std::atomic<size_t>, kNBins> allocUsed_{};
+    std::array<std::atomic<size_t>, kNBins> deallocUsed_{};
+  };
+
+}  // namespace
+
+// Service that keeps the monitor registered for the lifetime of the service.
+class HistogrammingAllocMonitor {
+public:
+  HistogrammingAllocMonitor()
+      : adaptor_(cms::perftools::AllocMonitorRegistry::instance().createAndRegisterMonitor<MonitorAdaptor>()) {}
+
+  ~HistogrammingAllocMonitor() {
+    adaptor_->performanceReport();
+    cms::perftools::AllocMonitorRegistry::instance().deregisterMonitor(adaptor_);
+  }
+
+  MonitorAdaptor* adaptor_;
+};
+
+DEFINE_FWK_SERVICE_MAKER(HistogrammingAllocMonitor, edm::serviceregistry::NoArgsMaker<HistogrammingAllocMonitor>);
diff --git a/PerfTools/AllocMonitor/plugins/SimpleAllocMonitor.cc b/PerfTools/AllocMonitor/plugins/SimpleAllocMonitor.cc
new file mode 100644
index 0000000000000..ecaaae551f9e4
--- /dev/null
+++ b/PerfTools/AllocMonitor/plugins/SimpleAllocMonitor.cc
@@ -0,0 +1,97 @@
+// -*- C++ -*-
+//
+// Package: PerfTools/AllocMonitor
+// Class : SimpleAllocMonitor
+//
+// Implementation:
+// Service that monitors allocations for the lifetime of the service and
+// reports the accumulated totals through edm::LogSystem when destroyed.
+//
+// Original Author: Christopher Jones
+// Created: Mon, 21 Aug 2023 20:31:57 GMT
+//
+
+// system include files
+#include <atomic>
+
+// user include files
+#include "PerfTools/AllocMonitor/interface/AllocMonitorBase.h"
+#include "PerfTools/AllocMonitor/interface/AllocMonitorRegistry.h"
+#include "FWCore/ServiceRegistry/interface/ServiceRegistry.h"
+#include "FWCore/MessageLogger/interface/MessageLogger.h"
+#include "FWCore/ServiceRegistry/interface/ServiceMaker.h"
+
+namespace {
+  // Accumulates allocation statistics. All state is atomic because the registry
+  // invokes the callbacks concurrently from every thread.
+  class MonitorAdaptor : public cms::perftools::AllocMonitorBase {
+  public:
+    // iRequested: bytes asked for by the caller; iActual: bytes the allocator handed out.
+    void allocCalled(size_t iRequested, size_t iActual) final {
+      nAllocations_.fetch_add(1, std::memory_order_acq_rel);
+      requested_.fetch_add(iRequested, std::memory_order_acq_rel);
+
+      //returns previous value
+      auto a = presentActual_.fetch_add(iActual, std::memory_order_acq_rel);
+      a += iActual;
+      // Raise the recorded maximum; a failed exchange reloads max, so retry only while we still exceed it.
+      auto max = maxActual_.load(std::memory_order_relaxed);
+      while (a > max) {
+        if (maxActual_.compare_exchange_strong(max, a, std::memory_order_acq_rel)) {
+          break;
+        }
+      }
+    }
+    // iActual: size of the block being released, as reported by the allocator.
+    void deallocCalled(size_t iActual) final {
+      nDeallocations_.fetch_add(1, std::memory_order_acq_rel);
+      // Blocks allocated before this monitor saw any callbacks can be freed later,
+      // so only subtract what is still accounted for. The test and the subtraction
+      // must be a single atomic step: with a separate load and fetch_sub two
+      // threads could both pass the test and wrap the unsigned counter below zero.
+      auto present = presentActual_.load(std::memory_order_acquire);
+      while (present >= iActual) {
+        if (presentActual_.compare_exchange_weak(present, present - iActual, std::memory_order_acq_rel)) {
+          break;
+        }
+      }
+    }
+
+    // Logs the totals gathered so far.
+    void performanceReport() const {
+      auto finalRequested = requested_.load(std::memory_order_acquire);
+      auto maxActual = maxActual_.load(std::memory_order_acquire);
+      auto allocs = nAllocations_.load(std::memory_order_acquire);
+      auto deallocs = nDeallocations_.load(std::memory_order_acquire);
+
+      edm::LogSystem("SimpleAllocMonitor")
+          << "Memory Report"
+          << "\n total memory requested: " << finalRequested << "\n max memory used: " << maxActual
+          << "\n # allocations calls: " << allocs << "\n # deallocations calls: " << deallocs;
+    }
+
+  private:
+    std::atomic<size_t> requested_ = 0;
+    std::atomic<size_t> presentActual_ = 0;
+    std::atomic<size_t> maxActual_ = 0;
+    std::atomic<size_t> nAllocations_ = 0;
+    std::atomic<size_t> nDeallocations_ = 0;
+  };
+
+}  // namespace
+
+// Service that keeps the monitor registered for the lifetime of the service.
+class SimpleAllocMonitor {
+public:
+  SimpleAllocMonitor()
+      : adaptor_(cms::perftools::AllocMonitorRegistry::instance().createAndRegisterMonitor<MonitorAdaptor>()) {}
+
+  ~SimpleAllocMonitor() {
+    adaptor_->performanceReport();
+    cms::perftools::AllocMonitorRegistry::instance().deregisterMonitor(adaptor_);
+  }
+
+  MonitorAdaptor* adaptor_;
+};
+
+DEFINE_FWK_SERVICE_MAKER(SimpleAllocMonitor, edm::serviceregistry::NoArgsMaker<SimpleAllocMonitor>);
diff --git a/PerfTools/AllocMonitor/src/AllocMonitorBase.cc b/PerfTools/AllocMonitor/src/AllocMonitorBase.cc
new file mode 100644
index 0000000000000..fd438edf7a2b5
--- /dev/null
+++ b/PerfTools/AllocMonitor/src/AllocMonitorBase.cc
@@ -0,0 +1,38 @@
+// -*- C++ -*-
+//
+// Package: PerfTools/AllocMonitor
+// Class : AllocMonitorBase
+//
+// Implementation:
+// [Notes on implementation]
+//
+// Original Author: Christopher Jones
+// Created: Mon, 21 Aug 2023 15:42:57 GMT
+//
+
+// system include files
+
+// user include files
+#include 
"PerfTools/AllocMonitor/interface/AllocMonitorBase.h"
+
+//
+// constants, enums and typedefs
+//
+
+//
+// static data member definitions
+//
+
+using namespace cms::perftools;
+
+//
+// constructors and destructor
+//
+AllocMonitorBase::AllocMonitorBase() {}
+
+// AllocMonitorBase::AllocMonitorBase(const AllocMonitorBase& rhs)
+// {
+// // do actual copying here;
+// }
+
+AllocMonitorBase::~AllocMonitorBase() {}
diff --git a/PerfTools/AllocMonitor/src/AllocMonitorRegistry.cc b/PerfTools/AllocMonitor/src/AllocMonitorRegistry.cc
new file mode 100644
index 0000000000000..b8f9c06613f20
--- /dev/null
+++ b/PerfTools/AllocMonitor/src/AllocMonitorRegistry.cc
@@ -0,0 +1,118 @@
+// -*- C++ -*-
+//
+// Package: PerfTools/AllocMonitor
+// Class : AllocMonitorRegistry
+//
+// Implementation:
+// [Notes on implementation]
+//
+// Original Author: Christopher Jones
+// Created: Mon, 21 Aug 2023 15:42:48 GMT
+//
+
+// system include files
+#include <dlfcn.h>  // dlsym
+
+// user include files
+#include "PerfTools/AllocMonitor/interface/AllocMonitorRegistry.h"
+#include "FWCore/Utilities/interface/Exception.h"
+
+//
+// constants, enums and typedefs
+//
+// Hooks looked up with dlsym at run time; the error message in start() expects them
+// to come from the LD_PRELOADed libPerfToolsAllocMonitorPreload.so.
+extern "C" {
+void alloc_monitor_start();
+void alloc_monitor_stop();
+}
+
+namespace {
+  // Per-thread flag returned by AllocMonitorRegistry::isRunning().
+  bool& threadRunning() {
+    static thread_local bool s_running = true;
+    return s_running;
+  }
+}  // namespace
+
+using namespace cms::perftools;
+
+//
+// static data member definitions
+//
+
+//
+// constructors and destructor
+//
+AllocMonitorRegistry::AllocMonitorRegistry() {
+  threadRunning() = true;
+  //Cannot start here because statics can cause memory to be allocated in the atexit registration
+  // done behind the scenes. If the allocation happens, AllocMonitorRegistry::instance will be called
+  // recursively before the static has finished and we will deadlock
+}
+
+// Calls alloc_monitor_stop when that symbol can be found, then destroys the monitors
+// with this thread's running flag cleared.
+AllocMonitorRegistry::~AllocMonitorRegistry() {
+  void* stop = dlsym(RTLD_DEFAULT, "alloc_monitor_stop");
+  if (stop != nullptr) {
+    auto s = reinterpret_cast<decltype(&::alloc_monitor_stop)>(stop);
+    s();
+  }
+  threadRunning() = false;
+  monitors_.clear();
+}
+
+//
+// member functions
+//
+// Calls alloc_monitor_start when no monitor is registered yet.
+// Throws cms::Exception("NoAllocMonitorPreload") if the symbol cannot be found.
+void AllocMonitorRegistry::start() {
+  if (monitors_.empty()) {
+    void* start = dlsym(RTLD_DEFAULT, "alloc_monitor_start");
+    if (start == nullptr) {
+      throw cms::Exception("NoAllocMonitorPreload")
+          << "The libPerfToolsAllocMonitorPreload.so was not LD_PRELOADed into the job";
+    }
+    auto s = reinterpret_cast<decltype(&::alloc_monitor_start)>(start);
+    s();
+  }
+}
+
+bool& AllocMonitorRegistry::isRunning() { return threadRunning(); }
+
+// Removes and destroys iMonitor; does nothing if it is not registered. The guard
+// keeps the monitor's own deallocation from being reported to the monitors.
+void AllocMonitorRegistry::deregisterMonitor(AllocMonitorBase* iMonitor) {
+  for (auto it = monitors_.begin(); monitors_.end() != it; ++it) {
+    if (it->get() == iMonitor) {
+      [[maybe_unused]] Guard g = makeGuard();
+      monitors_.erase(it);
+      break;
+    }
+  }
+}
+
+//
+// callback dispatch
+//
+// Forward one allocation/deallocation event to every registered monitor.
+void AllocMonitorRegistry::allocCalled_(size_t iRequested, size_t iActual) {
+  for (auto& m : monitors_) {
+    m->allocCalled(iRequested, iActual);
+  }
+}
+void AllocMonitorRegistry::deallocCalled_(size_t iActual) {
+  for (auto& m : monitors_) {
+    m->deallocCalled(iActual);
+  }
+}
+
+//
+// static member functions
+//
+AllocMonitorRegistry& AllocMonitorRegistry::instance() {
+  static AllocMonitorRegistry s_registry;
+  return s_registry;
+}
diff --git a/PerfTools/AllocMonitor/test/BuildFile.xml b/PerfTools/AllocMonitor/test/BuildFile.xml
new file mode 100644
index 0000000000000..1dcbe2a244a32
--- /dev/null
+++ b/PerfTools/AllocMonitor/test/BuildFile.xml
@@ -0,0 +1,10 @@
+
+
+
+
+
+
+
+
+
+
diff --git a/PerfTools/AllocMonitor/test/test_catch2_AllocMonitorRegistry.cc b/PerfTools/AllocMonitor/test/test_catch2_AllocMonitorRegistry.cc
new file mode 100644
index 
0000000000000..0b80dfa5b75f6
--- /dev/null
+++ b/PerfTools/AllocMonitor/test/test_catch2_AllocMonitorRegistry.cc
@@ -0,0 +1,177 @@
+#define CATCH_CONFIG_MAIN
+#include "catch.hpp"
+
+#include "PerfTools/AllocMonitor/interface/AllocMonitorRegistry.h"
+
+namespace cms::perftools {
+  // Test driver that owns a private AllocMonitorRegistry and feeds it
+  // synthetic allocation/deallocation events; no real memory is touched.
+  // NOTE(review): declared inside cms::perftools presumably because the
+  // registry grants it access -- confirm against AllocMonitorRegistry.h.
+  class AllocTester {
+  public:
+    // Report an allocation whose "allocator" returns nullptr and whose
+    // actual size is iActual.
+    void callAlloc(size_t iRequested, size_t iActual) {
+      reg_.allocCalled(
+          iRequested, []() { return nullptr; }, [iActual](auto) { return iActual; });
+    }
+
+    // Report a deallocation of iActual bytes with a no-op "deallocator".
+    void callDealloc(size_t iActual) {
+      reg_.deallocCalled([]() {}, [iActual]() { return iActual; });
+    }
+
+    // Same as above, but the caller supplies the allocator functor so it can
+    // itself trigger further registry calls.
+    template <typename A>
+    void callAlloc(size_t iRequested, size_t iActual, A&& iAlloc) {
+      reg_.allocCalled(iRequested, std::forward<A>(iAlloc), [iActual](auto) { return iActual; });
+    }
+
+    // Same as above, but the caller supplies the deallocator functor.
+    template <typename D>
+    void callDealloc(size_t iActual, D&& iDealloc) {
+      reg_.deallocCalled(std::forward<D>(iDealloc), [iActual]() { return iActual; });
+    }
+
+    AllocMonitorRegistry reg_;
+  };
+}  // namespace cms::perftools
+
+using namespace cms::perftools;
+
+namespace {
+  // Number of monitor constructor/destructor/callback invocations seen so far.
+  int s_calls = 0;
+
+  // Monitor that only counts how often it is constructed, destroyed or called.
+  class TestCallMonitor : public AllocMonitorBase {
+  public:
+    TestCallMonitor(int) { ++s_calls; }
+
+    ~TestCallMonitor() override { ++s_calls; }
+
+    void allocCalled(size_t iRequestedSize, size_t iActualSize) final { ++s_calls; }
+    void deallocCalled(size_t iActualSize) final { ++s_calls; }
+  };
+
+  // Set by the alloc_monitor_start/alloc_monitor_stop stand-ins below.
+  bool s_started = false;
+  bool s_stopped = false;
+
+  // Monitor that re-enters the registry from its constructor, destructor and
+  // both callbacks. The test expects the same call counts as for
+  // TestCallMonitor, i.e. the re-entrant events must not reach the monitor.
+  class TestRecursionMonitor : public AllocMonitorBase {
+  public:
+    TestRecursionMonitor(AllocTester* iTester) : tester_(iTester) {
+      ++s_calls;
+      tester_->callAlloc(1, 1);
+    }
+
+    ~TestRecursionMonitor() override {
+      ++s_calls;
+      tester_->callDealloc(1);
+    }
+
+    void allocCalled(size_t iRequestedSize, size_t iActualSize) final {
+      ++s_calls;
+      tester_->callAlloc(1, 1);
+      tester_->callDealloc(1);
+    }
+    void deallocCalled(size_t iActualSize) final {
+      ++s_calls;
+      tester_->callAlloc(1, 1);
+      tester_->callDealloc(1);
+    }
+
+  private:
+    AllocTester* tester_;
+  };
+}  // namespace
+
+// Stand-ins for the functions normally provided by the preload library; they
+// only record that they were called.
+extern "C" {
+void alloc_monitor_start() { s_started = true; }
+void alloc_monitor_stop() { s_stopped = true; }
+}
+
+TEST_CASE("Test API for AllocMonitorRegistry", "[AllocMonitorRegistry]") {
+  // Each section resets the globals at its end so the next one starts clean.
+  SECTION("Calls Check") {
+    CHECK(0 == s_calls);
+    CHECK(s_started == false);
+    CHECK(s_stopped == false);
+    {
+      AllocTester t;
+      CHECK(s_started == false);
+      CHECK(s_stopped == false);
+
+      auto tester = t.reg_.createAndRegisterMonitor<TestCallMonitor>(1);
+      CHECK(s_started == true);
+      CHECK(s_stopped == false);
+      CHECK(1 == s_calls);
+      CHECK(tester != nullptr);
+
+      t.callAlloc(1, 1);
+      CHECK(2 == s_calls);
+
+      t.callDealloc(1);
+      CHECK(3 == s_calls);
+
+      t.reg_.deregisterMonitor(tester);
+      CHECK(4 == s_calls);
+    }
+    CHECK(4 == s_calls);
+    CHECK(s_stopped == true);
+    s_started = false;
+    s_stopped = false;
+    s_calls = 0;
+  }
+  SECTION("Recursion in monitor") {
+    CHECK(0 == s_calls);
+    CHECK(s_started == false);
+    CHECK(s_stopped == false);
+    {
+      AllocTester t;
+      CHECK(s_started == false);
+      CHECK(s_stopped == false);
+
+      auto tester = t.reg_.createAndRegisterMonitor<TestRecursionMonitor>(&t);
+      CHECK(s_started == true);
+      CHECK(s_stopped == false);
+      CHECK(1 == s_calls);
+      CHECK(tester != nullptr);
+
+      t.callAlloc(1, 1);
+      CHECK(2 == s_calls);
+
+      t.callDealloc(1);
+      CHECK(3 == s_calls);
+
+      t.reg_.deregisterMonitor(tester);
+      CHECK(4 == s_calls);
+    }
+    CHECK(4 == s_calls);
+    CHECK(s_stopped == true);
+    s_started = false;
+    s_stopped = false;
+    s_calls = 0;
+  }
+  SECTION("System calling system") {
+    CHECK(0 == s_calls);
+    CHECK(s_started == false);
+    CHECK(s_stopped == false);
+    {
+      AllocTester t;
+      CHECK(s_started == false);
+      CHECK(s_stopped == false);
+
+      auto tester = t.reg_.createAndRegisterMonitor<TestCallMonitor>(1);
+      CHECK(s_started == true);
+      CHECK(s_stopped == false);
+      CHECK(1 == s_calls);
+      CHECK(tester != nullptr);
+
+      // An allocator that itself allocates must still yield a single callback.
+      t.callAlloc(1, 1, [&t]() {
+        t.callAlloc(1, 1);
+        return 1;
+      });
+      CHECK(2 == s_calls);
+
+      // Likewise for a deallocator that itself deallocates.
+      t.callDealloc(1, [&t]() { t.callDealloc(1); });
+      CHECK(3 == s_calls);
+
+      t.reg_.deregisterMonitor(tester);
+      CHECK(4 == s_calls);
+    }
+    CHECK(4 == s_calls);
+    CHECK(s_stopped == true);
+    s_started = false;
+    s_stopped = false;
+    s_calls = 0;
+  }
+}
diff --git a/PerfTools/AllocMonitor/test/test_proxies.cc b/PerfTools/AllocMonitor/test/test_proxies.cc
new file mode 100644
index 0000000000000..ec42133cf577c
--- /dev/null
+++ b/PerfTools/AllocMonitor/test/test_proxies.cc
@@ -0,0 +1,199 @@
+#include "PerfTools/AllocMonitor/interface/AllocMonitorBase.h"
+#include "PerfTools/AllocMonitor/interface/AllocMonitorRegistry.h"
+
+#include <cassert>
+#include <cstdlib>
+#include <iostream>
+#include <memory>
+#include <new>
+
+using namespace cms::perftools;
+
+namespace {
+  // Monitor that mirrors what it is told into two caller-owned counters:
+  // the size of the most recent request and the running total of live bytes.
+  class TestMonitor : public AllocMonitorBase {
+  public:
+    TestMonitor(size_t& iRequested, size_t& iTotal) : requested_(iRequested), total_(iTotal) {}
+
+    // `final` (as on the other monitors in this package) makes the compiler
+    // reject these if the base-class signatures ever drift.
+    void allocCalled(size_t iRequested, size_t iActual) final {
+      requested_ = iRequested;
+      total_ += iActual;
+    }
+
+    void deallocCalled(size_t iActual) final { total_ -= iActual; }
+
+  private:
+    size_t& requested_;
+    size_t& total_;
+  };
+}  // namespace
+
+// Exercises the C and C++ allocation functions one by one and checks that the
+// registered monitor saw the expected request size and that every allocation
+// was matched by a deallocation. Exits with status 1 on the first mismatch.
+int main() {
+  size_t requested = 0;
+  size_t total = 0;
+
+  // Fail unless the most recent request seen by the monitor was iExpected.
+  // The counter is copied before anything is printed because writing to
+  // std::cout may itself allocate and overwrite `requested`.
+  auto checkRequested = [&requested](size_t iExpected, const char* iWhat) {
+    auto const got = requested;
+    if (got != iExpected) {
+      std::cout << iWhat << " request size wrong, got " << got << " expected " << iExpected;
+      exit(1);
+    }
+  };
+  // Fail unless every byte reported as allocated has been reported as freed.
+  auto checkCleanedUp = [&total](const char* iWhat) {
+    if (total != 0) {
+      std::cout << iWhat << " request not cleaned up";
+      exit(1);
+    }
+  };
+  // Alignment used for the over-aligned operator new/delete checks.
+  constexpr std::align_val_t kAlign{512};
+
+  {
+    auto monitor = AllocMonitorRegistry::instance().createAndRegisterMonitor<TestMonitor>(requested, total);
+    if (requested != 0) {
+      std::cout << "Memory requested during monitor creation";
+      exit(1);
+    }
+    {
+      [[maybe_unused]] auto i = std::make_unique<int>(1);
+      checkRequested(sizeof(int), "int");
+    }
+    checkCleanedUp("int");
+
+    {
+      auto i = new int[5];
+      checkRequested(sizeof(int) * 5, "int");
+      delete[] i;
+    }
+    checkCleanedUp("int");
+
+    // Memory obtained from an aligned operator new must be released through
+    // the matching aligned operator delete; a plain delete-expression on an
+    // int* would select the unaligned overload (and would leave the aligned
+    // delete proxies untested).
+    {
+      auto i = new (kAlign) int;
+      checkRequested(sizeof(int), "int");
+      ::operator delete(i, kAlign);
+    }
+    checkCleanedUp("int");
+
+    {
+      auto i = new (kAlign) int[5];
+      checkRequested(sizeof(int) * 5, "int");
+      ::operator delete[](i, kAlign);
+    }
+    checkCleanedUp("int");
+
+    {
+      auto i = new (kAlign, std::nothrow) int;
+      checkRequested(sizeof(int), "int");
+      ::operator delete(i, kAlign);
+    }
+    checkCleanedUp("int");
+
+    {
+      auto i = new (kAlign, std::nothrow) int[5];
+      checkRequested(sizeof(int) * 5, "int");
+      ::operator delete[](i, kAlign);
+    }
+    checkCleanedUp("int");
+
+    {
+      auto i = new (std::nothrow) int;
+      checkRequested(sizeof(int), "int");
+      delete i;
+    }
+    checkCleanedUp("int");
+
+    {
+      auto i = new (std::nothrow) int[5];
+      checkRequested(sizeof(int) * 5, "int");
+      delete[] i;
+    }
+    checkCleanedUp("int");
+
+    {
+      auto p = calloc(12, 1);
+      assert(p != nullptr);
+      checkRequested(12, "calloc");
+      free(p);
+      checkCleanedUp("calloc");
+
+      p = malloc(50);
+      assert(p != nullptr);
+      checkRequested(50, "malloc");
+      p = realloc(p, 100);
+      assert(p != nullptr);
+      checkRequested(100, "realloc");
+      if (total < 100) {
+        auto t = total;
+        std::cout << "realloc request total too small " << t;
+        exit(1);
+      }
+
+      free(p);
+      if (total != 0) {
+        auto t = total;
+        std::cout << "free after realloc request not cleaned up, still have " << t;
+        exit(1);
+      }
+
+      p = aligned_alloc(128, 32);
+      assert(p != nullptr);
+      checkRequested(32, "aligned_alloc");
+      free(p);
+      checkCleanedUp("aligned_alloc");
+    }
+    AllocMonitorRegistry::instance().deregisterMonitor(monitor);
+  }
+}
diff --git a/PerfTools/AllocMonitorPreload/BuildFile.xml b/PerfTools/AllocMonitorPreload/BuildFile.xml
new file mode 100644
index 0000000000000..37e3b6bb6fecd
--- /dev/null
+++ b/PerfTools/AllocMonitorPreload/BuildFile.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
diff --git a/PerfTools/AllocMonitorPreload/README.md b/PerfTools/AllocMonitorPreload/README.md
new file mode 100644
index 0000000000000..e1c6a69939691
--- /dev/null
+++ b/PerfTools/AllocMonitorPreload/README.md
@@ -0,0 +1,30 @@
+# PerfTools/AllocMonitorPreload Description
+
+## Introduction
+
+This package works with the PerfTools/AllocMonitor package to provide a general facility to watch allocations and deallocations. See the README.md in that package for details on how to use the facility.
+
+## Technical Details
+
+This package overrides the standard C and C++ allocation and deallocation functions. These overridden functions call the
+appropriate methods of `cms::perftools::AllocMonitorRegistry` to allow monitoring of memory allocations and
+deallocations.
The overridden C and C++ standard methods use `dlsym` to find the original functions (i.e. whatever +versions of those methods were compiled into the executable) and then call those original functions to do the actual +allocation/deallocation. + +To support standard library C++ on Linux, one only needs to override the standard C methods to intercept all +allocations/deallocations. However, to intercept calls for jemalloc or tcmalloc, one must also override the C++ +methods. This is complicated as one must call `dlsym` using the _mangled_ names of the C++ methods. As the exact +mangled name can be different on different operating systems or CPU types, the exact name used will have to be updated +in this code to support different systems. `AllocMonitorRegistry` makes sure that if a standard function calls another +standard function to do the actual work, only one callback will be issued. + +There is no C or C++ standard method one can call to ask how much actual memory is associated with a given address +returned by an allocator. To provide such information, we use the GNU extension `malloc_usable_size`. To have +this facility support additional operating systems, an equivalent method would be needed. + +The facility starts and stops calls to the `cms::perftools::AllocMonitorRegistry` via the use of the functions `alloc_monitor_start` and `alloc_monitor_stop`. The `AllocMonitorRegistry` uses `dlsym` to locate these methods (avoiding link +time dependencies with this package) and, if the methods are not available by the first request to register a monitor, the +code throws an exception. The destructor of `AllocMonitorRegistry` calls the stop method. In this way, the facility can +never call methods on `AllocMonitorRegistry` when the registry is not available.
+
diff --git a/PerfTools/AllocMonitorPreload/src/memory_proxies.cc b/PerfTools/AllocMonitorPreload/src/memory_proxies.cc
new file mode 100644
index 0000000000000..b7f230ca72272
--- /dev/null
+++ b/PerfTools/AllocMonitorPreload/src/memory_proxies.cc
@@ -0,0 +1,338 @@
+// Replacement C and C++ allocation/deallocation functions.
+//
+// Every proxy looks up the function it replaces with dlsym(RTLD_NEXT, ...),
+// forwards the call to it and, while monitoring is switched on, reports the
+// event to cms::perftools::AllocMonitorRegistry. The size reported as
+// "actual" is always malloc_usable_size() of the pointer involved.
+
+#include <atomic>
+#include <cassert>
+#include <cstddef>
+#include <malloc.h>
+
+#include "PerfTools/AllocMonitor/interface/AllocMonitorRegistry.h"
+
+#include <dlfcn.h>  // dlsym
+
+namespace {
+
+  // True between alloc_monitor_start() and alloc_monitor_stop(). A
+  // function-local static so it exists whenever a proxy first needs it.
+  std::atomic<bool>& alloc_monitor_running_state() {
+    static std::atomic<bool> s_state = false;
+    return s_state;
+  }
+
+  // Look up the next definition of symbol iName after this library and return
+  // it as a function pointer of type T.
+  template <typename T>
+  T get(const char* iName) {
+    void* original = dlsym(RTLD_NEXT, iName);
+    assert(original);
+    return reinterpret_cast<T>(original);
+  }
+
+}  // namespace
+
+using namespace cms::perftools;
+extern "C" {
+// Switch reporting to the registry on and off.
+void alloc_monitor_start() { alloc_monitor_running_state() = true; }
+void alloc_monitor_stop() { alloc_monitor_running_state() = false; }
+
+//----------------------------------------------------------------
+//C memory functions
+
+void* malloc(size_t size) noexcept {
+  static auto original = get<decltype(&::malloc)>("malloc");
+  if (not alloc_monitor_running_state()) {
+    return original(size);
+  }
+  auto& reg = AllocMonitorRegistry::instance();
+  return reg.allocCalled(
+      size, [size]() { return original(size); }, [](auto ret) { return malloc_usable_size(ret); });
+}
+
+void* calloc(size_t nitems, size_t item_size) noexcept {
+  static auto original = get<decltype(&::calloc)>("calloc");
+
+  if (not alloc_monitor_running_state()) {
+    return original(nitems, item_size);
+  }
+  auto& reg = AllocMonitorRegistry::instance();
+  return reg.allocCalled(
+      nitems * item_size,
+      [nitems, item_size]() { return original(nitems, item_size); },
+      [](auto ret) { return malloc_usable_size(ret); });
+}
+
+// realloc is reported as a deallocation of the old usable size followed by an
+// allocation of the new one, and only when the usable size actually changed.
+void* realloc(void* ptr, size_t size) noexcept {
+  static auto original = get<decltype(&::realloc)>("realloc");
+  if (not alloc_monitor_running_state()) {
+    return original(ptr, size);
+  }
+  size_t oldsize = malloc_usable_size(ptr);
+  void* ret;
+  auto& reg = AllocMonitorRegistry::instance();
+  {
+    //in case this calls malloc/free
+    [[maybe_unused]] auto g = reg.makeGuard();
+    ret = original(ptr, size);
+  }
+  if (ret == nullptr and size != 0) {
+    // The request failed: the block at ptr has not been freed or moved, so
+    // reporting it as deallocated would corrupt the monitors' bookkeeping.
+    return ret;
+  }
+  size_t used = malloc_usable_size(ret);
+  if (used != oldsize) {
+    reg.deallocCalled([]() {}, [oldsize]() { return oldsize; });
+    reg.allocCalled(
+        size, []() { return nullptr; }, [used](auto) { return used; });
+  }
+  return ret;
+}
+
+void* aligned_alloc(size_t alignment, size_t size) noexcept {
+  static auto original = get<decltype(&::aligned_alloc)>("aligned_alloc");
+  if (not alloc_monitor_running_state()) {
+    return original(alignment, size);
+  }
+
+  auto& reg = AllocMonitorRegistry::instance();
+  return reg.allocCalled(
+      size,
+      [alignment, size]() { return original(alignment, size); },
+      [](auto ret) { return malloc_usable_size(ret); });
+}
+
+void free(void* ptr) noexcept {
+  static auto original = get<decltype(&::free)>("free");
+  if (not alloc_monitor_running_state()) {
+    original(ptr);
+    return;
+  }
+
+  auto& reg = AllocMonitorRegistry::instance();
+  reg.deallocCalled([ptr]() { original(ptr); }, [ptr]() { return malloc_usable_size(ptr); });
+}
+}
+
+//----------------------------------------------------------------
+//C++ memory functions
+//
+// The originals are looked up by mangled name; the name used is repeated in
+// the comment after each function.
+
+#define CPP_MEM_OVERRIDE
+
+#if defined(CPP_MEM_OVERRIDE)
+#include <new>
+
+void* operator new(std::size_t size) {
+  static auto original = get<void* (*)(std::size_t)>("_Znwm");
+  if (not alloc_monitor_running_state()) {
+    return original(size);
+  }
+
+  auto& reg = AllocMonitorRegistry::instance();
+  return reg.allocCalled(
+      size, [size]() { return original(size); }, [](auto ret) { return malloc_usable_size(ret); });
+}  //_Znwm
+
+void operator delete(void* ptr) noexcept {
+  static auto original = get<void (*)(void*) noexcept>("_ZdlPv");
+  if (not alloc_monitor_running_state()) {
+    original(ptr);
+    return;
+  }
+
+  auto& reg = AllocMonitorRegistry::instance();
+  reg.deallocCalled([ptr]() { original(ptr); }, [ptr]() { return malloc_usable_size(ptr); });
+}  //_ZdlPv
+
+void* operator new[](std::size_t size) {
+  static auto original = get<void* (*)(std::size_t)>("_Znam");
+  if (not alloc_monitor_running_state()) {
+    return original(size);
+  }
+
+  auto& reg = AllocMonitorRegistry::instance();
+  return reg.allocCalled(
+      size, [size]() { return original(size); }, [](auto ret) { return malloc_usable_size(ret); });
+}  //_Znam
+
+void operator delete[](void* ptr) noexcept {
+  static auto original = get<void (*)(void*) noexcept>("_ZdaPv");
+
+  if (not alloc_monitor_running_state()) {
+    original(ptr);
+    return;
+  }
+  auto& reg = AllocMonitorRegistry::instance();
+  reg.deallocCalled([ptr]() { original(ptr); }, [ptr]() { return malloc_usable_size(ptr); });
+}  //_ZdaPv
+
+void* operator new(std::size_t size, std::align_val_t al) {
+  static auto original = get<void* (*)(std::size_t, std::align_val_t)>("_ZnwmSt11align_val_t");
+  if (not alloc_monitor_running_state()) {
+    return original(size, al);
+  }
+
+  auto& reg = AllocMonitorRegistry::instance();
+  return reg.allocCalled(
+      size, [size, al]() { return original(size, al); }, [](auto ret) { return malloc_usable_size(ret); });
+}  //_ZnwmSt11align_val_t
+
+void* operator new[](std::size_t size, std::align_val_t al) {
+  static auto original = get<void* (*)(std::size_t, std::align_val_t)>("_ZnamSt11align_val_t");
+
+  if (not alloc_monitor_running_state()) {
+    return original(size, al);
+  }
+
+  auto& reg = AllocMonitorRegistry::instance();
+  return reg.allocCalled(
+      size, [size, al]() { return original(size, al); }, [](auto ret) { return malloc_usable_size(ret); });
+}  //_ZnamSt11align_val_t
+
+void* operator new(std::size_t size, const std::nothrow_t& tag) noexcept {
+  static auto original = get<void* (*)(std::size_t, const std::nothrow_t&) noexcept>("_ZnwmRKSt9nothrow_t");
+
+  if (not alloc_monitor_running_state()) {
+    return original(size, tag);
+  }
+
+  auto& reg = AllocMonitorRegistry::instance();
+  return reg.allocCalled(
+      size, [size, &tag]() { return original(size, tag); }, [](auto ret) { return malloc_usable_size(ret); });
+}  //_ZnwmRKSt9nothrow_t
+
+void* operator new[](std::size_t size, const std::nothrow_t& tag) noexcept {
+  static auto original = get<void* (*)(std::size_t, const std::nothrow_t&) noexcept>("_ZnamRKSt9nothrow_t");
+
+  if (not alloc_monitor_running_state()) {
+    return original(size, tag);
+  }
+
+  auto& reg = AllocMonitorRegistry::instance();
+  return reg.allocCalled(
+      size, [size, &tag]() { return original(size, tag); }, [](auto ret) { return malloc_usable_size(ret); });
+}  //_ZnamRKSt9nothrow_t
+
+void* operator new(std::size_t size, std::align_val_t al, const std::nothrow_t& tag) noexcept {
+  static auto original = get<void* (*)(std::size_t, std::align_val_t, const std::nothrow_t&) noexcept>(
+      "_ZnwmSt11align_val_tRKSt9nothrow_t");
+
+  if (not alloc_monitor_running_state()) {
+    return original(size, al, tag);
+  }
+
+  auto& reg = AllocMonitorRegistry::instance();
+  return reg.allocCalled(
+      size, [size, al, &tag]() { return original(size, al, tag); }, [](auto ret) { return malloc_usable_size(ret); });
+}  //_ZnwmSt11align_val_tRKSt9nothrow_t
+
+void* operator new[](std::size_t size, std::align_val_t al, const std::nothrow_t& tag) noexcept {
+  static auto original = get<void* (*)(std::size_t, std::align_val_t, const std::nothrow_t&) noexcept>(
+      "_ZnamSt11align_val_tRKSt9nothrow_t");
+
+  if (not alloc_monitor_running_state()) {
+    return original(size, al, tag);
+  }
+
+  auto& reg = AllocMonitorRegistry::instance();
+  return reg.allocCalled(
+      size, [size, al, &tag]() { return original(size, al, tag); }, [](auto ret) { return malloc_usable_size(ret); });
+}  //_ZnamSt11align_val_tRKSt9nothrow_t
+
+void operator delete(void* ptr, std::align_val_t al) noexcept {
+  static auto original = get<void (*)(void*, std::align_val_t) noexcept>("_ZdlPvSt11align_val_t");
+
+  if (not alloc_monitor_running_state()) {
+    original(ptr, al);
+    return;
+  }
+  auto& reg = AllocMonitorRegistry::instance();
+  reg.deallocCalled([ptr, al]() { original(ptr, al); }, [ptr]() { return malloc_usable_size(ptr); });
+}  //_ZdlPvSt11align_val_t
+
+void operator delete[](void* ptr, std::align_val_t al) noexcept {
+  static auto original = get<void (*)(void*, std::align_val_t) noexcept>("_ZdaPvSt11align_val_t");
+
+  if (not alloc_monitor_running_state()) {
+    original(ptr, al);
+    return;
+  }
+  auto& reg = AllocMonitorRegistry::instance();
+  reg.deallocCalled([ptr, al]() { original(ptr, al); }, [ptr]() { return malloc_usable_size(ptr); });
+}  //_ZdaPvSt11align_val_t
+
+void operator delete(void* ptr, std::size_t sz) noexcept {
+  static auto original = get<void (*)(void*, std::size_t) noexcept>("_ZdlPvm");
+
+  if (not alloc_monitor_running_state()) {
+    original(ptr, sz);
+    return;
+  }
+  auto& reg = AllocMonitorRegistry::instance();
+  reg.deallocCalled([ptr, sz]() { original(ptr, sz); }, [ptr]() { return malloc_usable_size(ptr); });
+}  //_ZdlPvm
+
+void operator delete[](void* ptr, std::size_t sz) noexcept {
+  static auto original = get<void (*)(void*, std::size_t) noexcept>("_ZdaPvm");
+
+  if (not alloc_monitor_running_state()) {
+    original(ptr, sz);
+    return;
+  }
+  auto& reg = AllocMonitorRegistry::instance();
+  reg.deallocCalled([ptr, sz]() { original(ptr, sz); }, [ptr]() { return malloc_usable_size(ptr); });
+}  //_ZdaPvm
+
+void operator delete(void* ptr, std::size_t sz, std::align_val_t al) noexcept {
+  static auto original = get<void (*)(void*, std::size_t, std::align_val_t) noexcept>("_ZdlPvmSt11align_val_t");
+
+  if (not alloc_monitor_running_state()) {
+    original(ptr, sz, al);
+    return;
+  }
+  auto& reg = AllocMonitorRegistry::instance();
+  reg.deallocCalled([ptr, sz, al]() { original(ptr, sz, al); }, [ptr]() { return malloc_usable_size(ptr); });
+}  //_ZdlPvmSt11align_val_t
+void operator delete[](void* ptr, std::size_t sz, std::align_val_t al) noexcept {
+  static auto original = get<void (*)(void*, std::size_t, std::align_val_t) noexcept>("_ZdaPvmSt11align_val_t");
+
+  if (not alloc_monitor_running_state()) {
+    original(ptr, sz, al);
+    return;
+  }
+  auto& reg = AllocMonitorRegistry::instance();
+  reg.deallocCalled([ptr, sz, al]() { original(ptr, sz, al); }, [ptr]() { return malloc_usable_size(ptr); });
+}  //_ZdaPvmSt11align_val_t
+
+void operator delete(void* ptr, const std::nothrow_t& tag) noexcept {
+  static auto original = get<void (*)(void*, const std::nothrow_t&) noexcept>("_ZdlPvRKSt9nothrow_t");
+
+  if (not alloc_monitor_running_state()) {
+    original(ptr, tag);
+    return;
+  }
+  auto& reg = AllocMonitorRegistry::instance();
+  reg.deallocCalled([ptr, &tag]() { original(ptr, tag); }, [ptr]() { return malloc_usable_size(ptr); });
+}  //_ZdlPvRKSt9nothrow_t
+void operator delete[](void* ptr, const std::nothrow_t& tag) noexcept {
+  static auto original = get<void (*)(void*, const std::nothrow_t&) noexcept>("_ZdaPvRKSt9nothrow_t");
+
+  if (not alloc_monitor_running_state()) {
+    original(ptr, tag);
+    return;
+  }
+  auto& reg = AllocMonitorRegistry::instance();
+  reg.deallocCalled([ptr, &tag]() { original(ptr, tag); }, [ptr]() { return malloc_usable_size(ptr); });
+}  //_ZdaPvRKSt9nothrow_t
+
+void operator delete(void* ptr, std::align_val_t al, const std::nothrow_t& tag) noexcept {
+  static auto original =
+      get<void (*)(void*, std::align_val_t, const std::nothrow_t&) noexcept>("_ZdlPvSt11align_val_tRKSt9nothrow_t");
+
+  if (not alloc_monitor_running_state()) {
+    original(ptr, al, tag);
+    return;
+  }
+  auto& reg = AllocMonitorRegistry::instance();
+  reg.deallocCalled([ptr, al, &tag]() { original(ptr, al, tag); }, [ptr]() { return malloc_usable_size(ptr); });
+}  //_ZdlPvSt11align_val_tRKSt9nothrow_t
+void operator delete[](void* ptr, std::align_val_t al, const std::nothrow_t& tag) noexcept {
+  static auto original =
+      get<void (*)(void*, std::align_val_t, const std::nothrow_t&) noexcept>("_ZdaPvSt11align_val_tRKSt9nothrow_t");
+
+  if (not alloc_monitor_running_state()) {
+    original(ptr, al, tag);
+    return;
+  }
+  auto& reg = AllocMonitorRegistry::instance();
+  reg.deallocCalled([ptr, al, &tag]() { original(ptr, al, tag); }, [ptr]() { return malloc_usable_size(ptr); });
+}  //_ZdaPvSt11align_val_tRKSt9nothrow_t
+
+#endif
diff --git a/PerfTools/MaxMemoryPreload/BuildFile.xml b/PerfTools/MaxMemoryPreload/BuildFile.xml
new file mode 100644
index 0000000000000..37e3b6bb6fecd
--- /dev/null
+++ b/PerfTools/MaxMemoryPreload/BuildFile.xml
@@ -0,0 +1,6 @@
+
+
+
+
+
+
diff --git a/PerfTools/MaxMemoryPreload/README.md b/PerfTools/MaxMemoryPreload/README.md
new file mode 100644
index 0000000000000..aee811960bfd2
--- /dev/null
+++ b/PerfTools/MaxMemoryPreload/README.md
@@ -0,0 +1,29 @@
+# PerfTools/MaxMemoryPreload Description
+
+## Introduction
+
+This package generates a library that is meant to be LD_PRELOADed along with libPerfToolsAllocMonitorPreload.so which
+uses `cms::perftools::AllocMonitorRegistry` to register a monitor before an application begins. When the application
+ends the monitor reports statistics about the allocations and deallocations.
+ +## Usage + +To use the package, one must issue the following LD_PRELOAD command before running the application (bash version +shown below) +``` +LD_PRELOAD="libPerfToolsAllocMonitorPreload.so libPerfToolsMaxMemoryPreload.so" +``` + +The order of the two libraries is important. + +## Reporting +When the application ends, the monitor will report the following to standard error: + +- Total number of bytes requested by all allocation calls during the job. Note that actual _used_ allocation can be greater than requested as the allocator may require additional memory be assigned. +- The maximum amount of _used_ allocated memory that was in use at one time. +- The amount of _used_ memory allocated during the job that has yet to be reclaimed by calling deallocation. +- Number of calls made to allocation functions. +- Number of calls made to deallocation functions. + +This service is multi-thread safe. Note that when run multi-threaded the maximum reported value will vary from job to job. + +If a job forks processes, the forked processes will also report the above information.
diff --git a/PerfTools/MaxMemoryPreload/src/preload.cc b/PerfTools/MaxMemoryPreload/src/preload.cc
new file mode 100644
index 0000000000000..5b923af1978f5
--- /dev/null
+++ b/PerfTools/MaxMemoryPreload/src/preload.cc
@@ -0,0 +1,81 @@
+// -*- C++ -*-
+//
+// Package:     PerfTools/MaxMemoryPreload
+// Class  :     preload
+//
+// Implementation:
+//     Registers one monitor with AllocMonitorRegistry while this library's
+//     static objects are initialised. The monitor keeps running totals in
+//     atomics and writes a summary to std::cerr from its destructor.
+//
+// Original Author:  Christopher Jones
+//         Created:  Wed, 23 Aug 2023 17:56:44 GMT
+//
+
+// system include files
+#include <atomic>
+#include <iostream>
+
+// user include files
+#include "PerfTools/AllocMonitor/interface/AllocMonitorBase.h"
+#include "PerfTools/AllocMonitor/interface/AllocMonitorRegistry.h"
+
+//
+// constants, enums and typedefs
+//
+
+//
+// static data member definitions
+//
+
+namespace {
+  // Accumulates allocation statistics and prints them when destroyed.
+  class MonitorAdaptor : public cms::perftools::AllocMonitorBase {
+  public:
+    MonitorAdaptor() noexcept = default;
+    ~MonitorAdaptor() noexcept override { performanceReport(); }
+
+  private:
+    // iRequested: bytes asked for by the caller; iActual: bytes the allocator
+    // really handed out. Updates the counters and the high-water mark.
+    void allocCalled(size_t iRequested, size_t iActual) final {
+      nAllocations_.fetch_add(1, std::memory_order_acq_rel);
+      requested_.fetch_add(iRequested, std::memory_order_acq_rel);
+
+      // fetch_add returns the value before the addition
+      auto a = presentActual_.fetch_add(iActual, std::memory_order_acq_rel);
+      a += iActual;
+      auto max = maxActual_.load(std::memory_order_relaxed);
+
+      // Raise the maximum unless another thread already stored a larger one;
+      // a failed exchange reloads `max` with the current value.
+      while (a > max) {
+        if (maxActual_.compare_exchange_strong(max, a, std::memory_order_acq_rel)) {
+          break;
+        }
+      }
+    }
+    // iActual: bytes being returned to the allocator.
+    void deallocCalled(size_t iActual) final {
+      nDeallocations_.fetch_add(1, std::memory_order_acq_rel);
+      // Subtract only if that does not take the unsigned counter below zero.
+      // The test and the subtraction must be one atomic step: with a separate
+      // load and fetch_sub two threads could both pass the test and together
+      // wrap the counter around.
+      auto present = presentActual_.load(std::memory_order_acquire);
+      while (present >= iActual) {
+        if (presentActual_.compare_exchange_weak(
+                present, present - iActual, std::memory_order_acq_rel, std::memory_order_acquire)) {
+          break;
+        }
+      }
+    }
+
+    // Write the collected statistics to std::cerr.
+    void performanceReport() const {
+      auto finalRequested = requested_.load(std::memory_order_acquire);
+      auto maxActual = maxActual_.load(std::memory_order_acquire);
+      auto present = presentActual_.load(std::memory_order_acquire);
+      auto allocs = nAllocations_.load(std::memory_order_acquire);
+      auto deallocs = nDeallocations_.load(std::memory_order_acquire);
+
+      std::cerr << "Memory Report"
+                << "\n  total memory requested: " << finalRequested << "\n  max memory used: " << maxActual
+                << "\n  presently used: " << present << "\n  # allocations calls:   " << allocs
+                << "\n  # deallocations calls: " << deallocs << "\n";
+    }
+
+  private:
+    std::atomic<size_t> requested_ = 0;      // sum of all requested sizes
+    std::atomic<size_t> presentActual_ = 0;  // actual bytes currently outstanding
+    std::atomic<size_t> maxActual_ = 0;      // largest value presentActual_ has reached
+    std::atomic<size_t> nAllocations_ = 0;
+    std::atomic<size_t> nDeallocations_ = 0;
+  };
+
+  // Creating the monitor as part of static initialisation starts the
+  // bookkeeping as soon as this library is loaded.
+  [[maybe_unused]] auto s_instance =
+      cms::perftools::AllocMonitorRegistry::instance().createAndRegisterMonitor<MonitorAdaptor>();
+}  // namespace