[GPU] Use array for tracking memory usage instead of map (#25269)
### Details:
- Any additional locking and synchronization on memory allocation might have a negative impact on MT execution.
- `std::map` has very slow access and requires a lock on every access. We can use `std::array` instead to hold a compile-time-known number of buckets.
 - The `array` container has lower access latency and memory overhead.
 - We might be able to remove the mutex lock on stat collection (see the sketch below).
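To make the intent concrete, here is a minimal standalone sketch of the structure this change moves to, assuming a simplified tracker (`alloc_kind`, `memory_tracker`, and its members are illustrative names, not the plugin's actual types); the real code is in the diffs below:

```cpp
#include <array>
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <iostream>

// A trailing max_value sentinel fixes the number of buckets at compile time.
enum class alloc_kind : size_t { host, shared, device, max_value };

class memory_tracker {
public:
    void add(uint64_t bytes, alloc_kind kind) {
        const auto idx = static_cast<size_t>(kind);
        // fetch_add returns the previous value, so previous + bytes is the new total.
        const uint64_t new_val = _usage[idx].fetch_add(bytes) + bytes;
        // Raise the recorded peak with a simple check-then-store loop,
        // mirroring the approach taken in the engine.cpp diff below.
        while (new_val > _peak[idx]) {
            _peak[idx] = new_val;
        }
    }

    void subtract(uint64_t bytes, alloc_kind kind) {
        _usage[static_cast<size_t>(kind)] -= bytes;
    }

    uint64_t used(alloc_kind kind) const { return _usage[static_cast<size_t>(kind)].load(); }
    uint64_t peak(alloc_kind kind) const { return _peak[static_cast<size_t>(kind)].load(); }

private:
    // One atomic counter per allocation kind: no map lookup, no mutex.
    std::array<std::atomic<uint64_t>, static_cast<size_t>(alloc_kind::max_value)> _usage{};
    std::array<std::atomic<uint64_t>, static_cast<size_t>(alloc_kind::max_value)> _peak{};
};

int main() {
    memory_tracker tracker;
    tracker.add(1024, alloc_kind::device);
    tracker.add(512, alloc_kind::device);
    tracker.subtract(1024, alloc_kind::device);
    std::cout << "used=" << tracker.used(alloc_kind::device)
              << " peak=" << tracker.peak(alloc_kind::device) << "\n";  // used=512 peak=1536
    return 0;
}
```

The trailing `max_value` sentinel plays the same role as the one added to `allocation_type` below: it fixes the array size at compile time, so a lookup becomes a plain index instead of a map search under a mutex.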
ialbrecht authored Jul 29, 2024
1 parent a9bfd0f commit 131c944
Showing 3 changed files with 30 additions and 39 deletions.
5 changes: 2 additions & 3 deletions src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp
@@ -167,10 +167,9 @@ class engine {
     /// Create engine for given @p device and @p configuration
     engine(const device::ptr device);
     const device::ptr _device;
-    mutable std::mutex _mutex;
 
-    std::map<allocation_type, std::atomic<uint64_t>> _memory_usage_map;
-    std::map<allocation_type, std::atomic<uint64_t>> _peak_memory_usage_map;
+    std::array<std::atomic<uint64_t>, static_cast<size_t>(allocation_type::max_value)> _memory_usage_data{};
+    std::array<std::atomic<uint64_t>, static_cast<size_t>(allocation_type::max_value)> _peak_memory_usage_data{};
 };
 
 } // namespace cldnn
@@ -18,6 +18,7 @@ enum class allocation_type {
     usm_host, // Accessible by host and device. Not Migratable
     usm_shared, // Accessible by host and device. Migrtable.
     usm_device, // Accessible only by device. Not migratable.
+    max_value, // Used for data array size. Shall be last
 };
 
 inline std::ostream& operator<<(std::ostream& out, const allocation_type& alloc_type) {
63 changes: 27 additions & 36 deletions src/plugins/intel_gpu/src/runtime/engine.cpp
@@ -197,65 +197,56 @@ memory_ptr engine::share_surface(const layout& layout, shared_surface surf, uint
 #endif // _WIN32
 
 uint64_t engine::get_max_used_device_memory() const {
-    std::lock_guard<std::mutex> guard(_mutex);
     uint64_t total_peak_memory_usage {0};
-    for (auto const& m : _peak_memory_usage_map) {
-        total_peak_memory_usage += m.second.load();
+    for (auto const& m : _peak_memory_usage_data) {
+        total_peak_memory_usage += m.load();
     }
     return total_peak_memory_usage;
 }
 
 uint64_t engine::get_max_used_device_memory(allocation_type type) const {
-    std::lock_guard<std::mutex> guard(_mutex);
-    uint64_t peak_memory_usage {0};
-    auto iter = _peak_memory_usage_map.find(type);
-    if (iter != _peak_memory_usage_map.end()) {
-        peak_memory_usage = iter->second.load();
-    }
-    return peak_memory_usage;
+    return _peak_memory_usage_data[static_cast<size_t>(type)].load();
 }
 
 uint64_t engine::get_used_device_memory(allocation_type type) const {
-    std::lock_guard<std::mutex> guard(_mutex);
-    uint64_t memory_usage {0};
-    auto iter = _memory_usage_map.find(type);
-    if (iter != _memory_usage_map.end()) {
-        memory_usage = iter->second.load();
-    }
-    return memory_usage;
+    return _memory_usage_data[static_cast<size_t>(type)].load();
 }
 
 std::map<std::string, uint64_t> engine::get_memory_statistics() const {
-    std::lock_guard<std::mutex> guard(_mutex);
     std::map<std::string, uint64_t> statistics;
-    for (auto const& m : _memory_usage_map) {
-        std::ostringstream oss;
-        oss << m.first;
-        statistics[oss.str()] = m.second.load();
-    }
+    const auto add_stat = [&](allocation_type type) {
+        auto idx = static_cast<size_t>(type);
+        auto value = _memory_usage_data[idx].load();
+        if (value != 0) {
+            std::ostringstream oss;
+            oss << type;
+            statistics[oss.str()] = value;
+        }
+    };
+
+    add_stat(allocation_type::unknown);
+    add_stat(allocation_type::cl_mem);
+    add_stat(allocation_type::usm_host);
+    add_stat(allocation_type::usm_shared);
+    add_stat(allocation_type::usm_device);
     return statistics;
 }
 
 void engine::add_memory_used(uint64_t bytes, allocation_type type) {
-    std::lock_guard<std::mutex> guard(_mutex);
-    if (!_memory_usage_map.count(type) && !_peak_memory_usage_map.count(type)) {
-        _memory_usage_map[type] = 0;
-        _peak_memory_usage_map[type] = 0;
-    }
-    _memory_usage_map[type] += bytes;
-    if (_memory_usage_map[type] > _peak_memory_usage_map[type]) {
-        _peak_memory_usage_map[type] = _memory_usage_map[type].load();
+    auto idx = static_cast<size_t>(type);
+    const auto new_val = _memory_usage_data[idx].fetch_add(bytes) + bytes;
+    // Make sure actual maximum value is stored
+    while (new_val > _peak_memory_usage_data[idx]) {
+        _peak_memory_usage_data[idx] = new_val;
     }
 }
 
 void engine::subtract_memory_used(uint64_t bytes, allocation_type type) {
-    std::lock_guard<std::mutex> guard(_mutex);
-    auto iter = _memory_usage_map.find(type);
-    if (iter != _memory_usage_map.end()) {
-        _memory_usage_map[type] -= bytes;
-    } else {
+    auto idx = static_cast<size_t>(type);
+    if (_memory_usage_data[idx].load() < bytes) {
         throw std::runtime_error("Attempt to free unallocated memory");
     }
+    _memory_usage_data[idx] -= bytes;
 }
 
 std::shared_ptr<cldnn::engine> engine::create(engine_types engine_type, runtime_types runtime_type, const device::ptr device) {
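The peak update in `add_memory_used` above pairs `fetch_add` with a check-then-store loop. For comparison, a compact lock-free way to record a running maximum is a `compare_exchange_weak` loop; a minimal sketch (the helper `update_peak` is illustrative and not part of this commit):

```cpp
#include <atomic>
#include <cstdint>

// Raise `peak` to at least `new_val` without a mutex. On failure,
// compare_exchange_weak reloads the current value into `observed`,
// so the loop exits only once `peak` holds a value >= new_val.
void update_peak(std::atomic<uint64_t>& peak, uint64_t new_val) {
    uint64_t observed = peak.load();
    while (new_val > observed && !peak.compare_exchange_weak(observed, new_val)) {
    }
}
```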
