From 131c944a258dfd625cf87667d764cb4103e73d81 Mon Sep 17 00:00:00 2001 From: Ilya Albrecht Date: Mon, 29 Jul 2024 08:33:05 -0700 Subject: [PATCH] [GPU] Use array for tracking memory usage instead of map (#25269) ### Details: - Any additional locking and synchronization on memory allocation might have a negative impact on MT execution. - `std::map` has very slow access and requires a lock on every access. We can use `std::array` instead to hold a compile-time-known number of buckets. - The `array` container has lower access latency and memory overhead. - We might be able to remove the mutex lock on stat collection. --- .../include/intel_gpu/runtime/engine.hpp | 5 +- .../include/intel_gpu/runtime/memory_caps.hpp | 1 + src/plugins/intel_gpu/src/runtime/engine.cpp | 63 ++++++++----------- 3 files changed, 30 insertions(+), 39 deletions(-) diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp index 320e2b466de5a4..7e77ceb6785cb5 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp @@ -167,10 +167,9 @@ class engine { /// Create engine for given @p device and @p configuration engine(const device::ptr device); const device::ptr _device; - mutable std::mutex _mutex; - std::map> _memory_usage_map; - std::map> _peak_memory_usage_map; + std::array, static_cast(allocation_type::max_value)> _memory_usage_data{}; + std::array, static_cast(allocation_type::max_value)> _peak_memory_usage_data{}; }; } // namespace cldnn diff --git a/src/plugins/intel_gpu/include/intel_gpu/runtime/memory_caps.hpp b/src/plugins/intel_gpu/include/intel_gpu/runtime/memory_caps.hpp index 306a23fe1c3aaa..0a8da995d9af02 100644 --- a/src/plugins/intel_gpu/include/intel_gpu/runtime/memory_caps.hpp +++ b/src/plugins/intel_gpu/include/intel_gpu/runtime/memory_caps.hpp @@ -18,6 +18,7 @@ enum class allocation_type { usm_host, // Accessible by host and device. 
Not Migratable usm_shared, // Accessible by host and device. Migrtable. usm_device, // Accessible only by device. Not migratable. + max_value, // Used for data array size. Shall be last }; inline std::ostream& operator<<(std::ostream& out, const allocation_type& alloc_type) { diff --git a/src/plugins/intel_gpu/src/runtime/engine.cpp b/src/plugins/intel_gpu/src/runtime/engine.cpp index ec0beef6a8aa31..73da14f6e16f47 100644 --- a/src/plugins/intel_gpu/src/runtime/engine.cpp +++ b/src/plugins/intel_gpu/src/runtime/engine.cpp @@ -197,65 +197,56 @@ memory_ptr engine::share_surface(const layout& layout, shared_surface surf, uint #endif // _WIN32 uint64_t engine::get_max_used_device_memory() const { - std::lock_guard guard(_mutex); uint64_t total_peak_memory_usage {0}; - for (auto const& m : _peak_memory_usage_map) { - total_peak_memory_usage += m.second.load(); + for (auto const& m : _peak_memory_usage_data) { + total_peak_memory_usage += m.load(); } return total_peak_memory_usage; } uint64_t engine::get_max_used_device_memory(allocation_type type) const { - std::lock_guard guard(_mutex); - uint64_t peak_memory_usage {0}; - auto iter = _peak_memory_usage_map.find(type); - if (iter != _peak_memory_usage_map.end()) { - peak_memory_usage = iter->second.load(); - } - return peak_memory_usage; + return _peak_memory_usage_data[static_cast(type)].load(); } uint64_t engine::get_used_device_memory(allocation_type type) const { - std::lock_guard guard(_mutex); - uint64_t memory_usage {0}; - auto iter = _memory_usage_map.find(type); - if (iter != _memory_usage_map.end()) { - memory_usage = iter->second.load(); - } - return memory_usage; + return _memory_usage_data[static_cast(type)].load(); } std::map engine::get_memory_statistics() const { - std::lock_guard guard(_mutex); std::map statistics; - for (auto const& m : _memory_usage_map) { - std::ostringstream oss; - oss << m.first; - statistics[oss.str()] = m.second.load(); - } + const auto add_stat = [&](allocation_type type) { + 
auto idx = static_cast(type); + auto value = _memory_usage_data[idx].load(); + if (value != 0) { + std::ostringstream oss; + oss << type; + statistics[oss.str()] = value; + } + }; + + add_stat(allocation_type::unknown); + add_stat(allocation_type::cl_mem); + add_stat(allocation_type::usm_host); + add_stat(allocation_type::usm_shared); + add_stat(allocation_type::usm_device); return statistics; } void engine::add_memory_used(uint64_t bytes, allocation_type type) { - std::lock_guard guard(_mutex); - if (!_memory_usage_map.count(type) && !_peak_memory_usage_map.count(type)) { - _memory_usage_map[type] = 0; - _peak_memory_usage_map[type] = 0; - } - _memory_usage_map[type] += bytes; - if (_memory_usage_map[type] > _peak_memory_usage_map[type]) { - _peak_memory_usage_map[type] = _memory_usage_map[type].load(); + auto idx = static_cast(type); + const auto new_val = _memory_usage_data[idx].fetch_add(bytes) + bytes; + // Make sure actual maximum value is stored + while (new_val > _peak_memory_usage_data[idx]) { + _peak_memory_usage_data[idx] = new_val; } } void engine::subtract_memory_used(uint64_t bytes, allocation_type type) { - std::lock_guard guard(_mutex); - auto iter = _memory_usage_map.find(type); - if (iter != _memory_usage_map.end()) { - _memory_usage_map[type] -= bytes; - } else { + auto idx = static_cast(type); + if (_memory_usage_data[idx].load() < bytes) { throw std::runtime_error("Attempt to free unallocated memory"); } + _memory_usage_data[idx] -= bytes; } std::shared_ptr engine::create(engine_types engine_type, runtime_types runtime_type, const device::ptr device) {