From 131c944a258dfd625cf87667d764cb4103e73d81 Mon Sep 17 00:00:00 2001 From: Ilya Albrecht Date: Mon, 29 Jul 2024 08:33:05 -0700 Subject: [PATCH] [GPU] Use array for tracking memory usage instead of map (#25269) ### Details: - Any additional locking and synchronization on memory allocation might have a negative impact on MT execution. - `std::map` has very slow access and requires a lock on every access. We can use `std::array` instead to hold a compile-time-known number of buckets. - The `array` container has lower access latency and memory overhead. - We might be able to remove the mutex lock on stat collection. --- .../include/intel_gpu/runtime/engine.hpp | 5 +- .../include/intel_gpu/runtime/memory_caps.hpp | 1 + src/plugins/intel_gpu/src/runtime/engine.cpp | 63 ++++++++----------- 3 files changed, 30 insertions(+), 39 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp index 320e2b466de5a4..7e77ceb6785cb5 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp @@ -167,10 +167,9 @@ class engine { /// Create engine for given @p device and @p configuration engine(const device::ptr device); const device::ptr _device; - mutable std::mutex _mutex; - std::map> _memory_usage_map; - std::map> _peak_memory_usage_map; + std::array, static_cast(allocation_type::max_value)> _memory_usage_data{}; + std::array, static_cast(allocation_type::max_value)> _peak_memory_usage_data{}; }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/memory_caps.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/memory_caps.hpp index 306a23fe1c3aaa..0a8da995d9af02 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/memory_caps.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/memory_caps.hpp @@ -18,6 +18,7 @@ enum class allocation_type { usm_host, // Accessible by host and device. 
Not Migratable usm_shared, // Accessible by host and device. Migrtable. usm_device, // Accessible only by device. Not migratable. + max_value, // Used for data array size. Shall be last }; inline std::ostream& operator<<(std::ostream& out, const allocation_type& alloc_type) { diff --git a/src/plugins/intel_gpu/src/runtime/engine.cpp b/src/plugins/intel_gpu/src/runtime/engine.cpp index ec0beef6a8aa31..73da14f6e16f47 100644 --- a/src/plugins/intel_gpu/src/runtime/engine.cpp +++ b/src/plugins/intel_gpu/src/runtime/engine.cpp @@ -197,65 +197,56 @@ memory_ptr engine::share_surface(const layout& layout, shared_surface surf, uint #endif // _WIN32 uint64_t engine::get_max_used_device_memory() const { - std::lock_guard guard(_mutex); uint64_t total_peak_memory_usage {0}; - for (auto const& m : _peak_memory_usage_map) { - total_peak_memory_usage += m.second.load(); + for (auto const& m : _peak_memory_usage_data) { + total_peak_memory_usage += m.load(); } return total_peak_memory_usage; } uint64_t engine::get_max_used_device_memory(allocation_type type) const { - std::lock_guard guard(_mutex); - uint64_t peak_memory_usage {0}; - auto iter = _peak_memory_usage_map.find(type); - if (iter != _peak_memory_usage_map.end()) { - peak_memory_usage = iter->second.load(); - } - return peak_memory_usage; + return _peak_memory_usage_data[static_cast(type)].load(); } uint64_t engine::get_used_device_memory(allocation_type type) const { - std::lock_guard guard(_mutex); - uint64_t memory_usage {0}; - auto iter = _memory_usage_map.find(type); - if (iter != _memory_usage_map.end()) { - memory_usage = iter->second.load(); - } - return memory_usage; + return _memory_usage_data[static_cast(type)].load(); } std::map engine::get_memory_statistics() const { - std::lock_guard guard(_mutex); std::map statistics; - for (auto const& m : _memory_usage_map) { - std::ostringstream oss; - oss << m.first; - statistics[oss.str()] = m.second.load(); - } + const auto add_stat = [&](allocation_type type) { + 
auto idx = static_cast(type); + auto value = _memory_usage_data[idx].load(); + if (value != 0) { + std::ostringstream oss; + oss << type; + statistics[oss.str()] = value; + } + }; + + add_stat(allocation_type::unknown); + add_stat(allocation_type::cl_mem); + add_stat(allocation_type::usm_host); + add_stat(allocation_type::usm_shared); + add_stat(allocation_type::usm_device); return statistics; } void engine::add_memory_used(uint64_t bytes, allocation_type type) { - std::lock_guard guard(_mutex); - if (!_memory_usage_map.count(type) && !_peak_memory_usage_map.count(type)) { - _memory_usage_map[type] = 0; - _peak_memory_usage_map[type] = 0; - } - _memory_usage_map[type] += bytes; - if (_memory_usage_map[type] > _peak_memory_usage_map[type]) { - _peak_memory_usage_map[type] = _memory_usage_map[type].load(); + auto idx = static_cast(type); + const auto new_val = _memory_usage_data[idx].fetch_add(bytes) + bytes; + // Make sure actual maximum value is stored + while (new_val > _peak_memory_usage_data[idx]) { + _peak_memory_usage_data[idx] = new_val; } } void engine::subtract_memory_used(uint64_t bytes, allocation_type type) { - std::lock_guard guard(_mutex); - auto iter = _memory_usage_map.find(type); - if (iter != _memory_usage_map.end()) { - _memory_usage_map[type] -= bytes; - } else { + auto idx = static_cast(type); + if (_memory_usage_data[idx].load() < bytes) { throw std::runtime_error("Attempt to free unallocated memory"); } + _memory_usage_data[idx] -= bytes; } std::shared_ptr engine::create(engine_types engine_type, runtime_types runtime_type, const device::ptr device) {