[GPU] Use array for tracking memory usage instead of map (#25269)
### Details:
- Any additional locking and synchronization on memory allocation might have a negative impact on MT execution.
- `std::map` has very slow access and requires a lock on every access. We can use `std::array` instead to hold a compile-time-known number of buckets.
 - The `array` container has lower access latency and memory overhead.
 - We might be able to remove the mutex lock on stat collection (see the sketch below).
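To make the intent concrete, here is a minimal standalone sketch of the structure this change moves to, assuming a simplified tracker (`alloc_kind`, `memory_tracker`, and its members are illustrative names, not the plugin's actual types); the real code is in the diffs below:

```cpp
#include <array>
#include <atomic>
#include <cstddef>
#include <cstdint>
#include <iostream>

// A trailing max_value sentinel fixes the number of buckets at compile time.
enum class alloc_kind : size_t { host, shared, device, max_value };

class memory_tracker {
public:
    void add(uint64_t bytes, alloc_kind kind) {
        const auto idx = static_cast<size_t>(kind);
        // fetch_add returns the previous value, so previous + bytes is the new total.
        const uint64_t new_val = _usage[idx].fetch_add(bytes) + bytes;
        // Raise the recorded peak with a simple check-then-store loop,
        // mirroring the approach taken in the engine.cpp diff below.
        while (new_val > _peak[idx]) {
            _peak[idx] = new_val;
        }
    }

    void subtract(uint64_t bytes, alloc_kind kind) {
        _usage[static_cast<size_t>(kind)] -= bytes;
    }

    uint64_t used(alloc_kind kind) const { return _usage[static_cast<size_t>(kind)].load(); }
    uint64_t peak(alloc_kind kind) const { return _peak[static_cast<size_t>(kind)].load(); }

private:
    // One atomic counter per allocation kind: no map lookup, no mutex.
    std::array<std::atomic<uint64_t>, static_cast<size_t>(alloc_kind::max_value)> _usage{};
    std::array<std::atomic<uint64_t>, static_cast<size_t>(alloc_kind::max_value)> _peak{};
};

int main() {
    memory_tracker tracker;
    tracker.add(1024, alloc_kind::device);
    tracker.add(512, alloc_kind::device);
    tracker.subtract(1024, alloc_kind::device);
    std::cout << "used=" << tracker.used(alloc_kind::device)
              << " peak=" << tracker.peak(alloc_kind::device) << "\n";  // used=512 peak=1536
    return 0;
}
```

The trailing `max_value` sentinel plays the same role as the one added to `allocation_type` below: it fixes the array size at compile time, so a lookup becomes a plain index instead of a map search under a mutex.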
ialbrecht authored Jul 29, 2024
1 parent a9bfd0f commit 131c944
Showing 3 changed files with 30 additions and 39 deletions.
5 changes: 2 additions & 3 deletions src/plugins/intel_gpu/include/intel_gpu/runtime/engine.hpp
@@ -167,10 +167,9 @@ class engine {
     /// Create engine for given @p device and @p configuration
     engine(const device::ptr device);
     const device::ptr _device;
-    mutable std::mutex _mutex;
 
-    std::map<allocation_type, std::atomic<uint64_t>> _memory_usage_map;
-    std::map<allocation_type, std::atomic<uint64_t>> _peak_memory_usage_map;
+    std::array<std::atomic<uint64_t>, static_cast<size_t>(allocation_type::max_value)> _memory_usage_data{};
+    std::array<std::atomic<uint64_t>, static_cast<size_t>(allocation_type::max_value)> _peak_memory_usage_data{};
 };
 
 } // namespace cldnn
@@ -18,6 +18,7 @@ enum class allocation_type {
     usm_host, // Accessible by host and device. Not Migratable
     usm_shared, // Accessible by host and device. Migrtable.
     usm_device, // Accessible only by device. Not migratable.
+    max_value, // Used for data array size. Shall be last
 };
 
 inline std::ostream& operator<<(std::ostream& out, const allocation_type& alloc_type) {
63 changes: 27 additions & 36 deletions src/plugins/intel_gpu/src/runtime/engine.cpp
@@ -197,65 +197,56 @@ memory_ptr engine::share_surface(const layout& layout, shared_surface surf, uint
 #endif // _WIN32
 
 uint64_t engine::get_max_used_device_memory() const {
-    std::lock_guard<std::mutex> guard(_mutex);
     uint64_t total_peak_memory_usage {0};
-    for (auto const& m : _peak_memory_usage_map) {
-        total_peak_memory_usage += m.second.load();
+    for (auto const& m : _peak_memory_usage_data) {
+        total_peak_memory_usage += m.load();
     }
     return total_peak_memory_usage;
 }
 
 uint64_t engine::get_max_used_device_memory(allocation_type type) const {
-    std::lock_guard<std::mutex> guard(_mutex);
-    uint64_t peak_memory_usage {0};
-    auto iter = _peak_memory_usage_map.find(type);
-    if (iter != _peak_memory_usage_map.end()) {
-        peak_memory_usage = iter->second.load();
-    }
-    return peak_memory_usage;
+    return _peak_memory_usage_data[static_cast<size_t>(type)].load();
 }
 
 uint64_t engine::get_used_device_memory(allocation_type type) const {
-    std::lock_guard<std::mutex> guard(_mutex);
-    uint64_t memory_usage {0};
-    auto iter = _memory_usage_map.find(type);
-    if (iter != _memory_usage_map.end()) {
-        memory_usage = iter->second.load();
-    }
-    return memory_usage;
+    return _memory_usage_data[static_cast<size_t>(type)].load();
 }
 
 std::map<std::string, uint64_t> engine::get_memory_statistics() const {
-    std::lock_guard<std::mutex> guard(_mutex);
     std::map<std::string, uint64_t> statistics;
-    for (auto const& m : _memory_usage_map) {
-        std::ostringstream oss;
-        oss << m.first;
-        statistics[oss.str()] = m.second.load();
-    }
+    const auto add_stat = [&](allocation_type type) {
+        auto idx = static_cast<size_t>(type);
+        auto value = _memory_usage_data[idx].load();
+        if (value != 0) {
+            std::ostringstream oss;
+            oss << type;
+            statistics[oss.str()] = value;
+        }
+    };
+
+    add_stat(allocation_type::unknown);
+    add_stat(allocation_type::cl_mem);
+    add_stat(allocation_type::usm_host);
+    add_stat(allocation_type::usm_shared);
+    add_stat(allocation_type::usm_device);
     return statistics;
 }
 
 void engine::add_memory_used(uint64_t bytes, allocation_type type) {
-    std::lock_guard<std::mutex> guard(_mutex);
-    if (!_memory_usage_map.count(type) && !_peak_memory_usage_map.count(type)) {
-        _memory_usage_map[type] = 0;
-        _peak_memory_usage_map[type] = 0;
-    }
-    _memory_usage_map[type] += bytes;
-    if (_memory_usage_map[type] > _peak_memory_usage_map[type]) {
-        _peak_memory_usage_map[type] = _memory_usage_map[type].load();
+    auto idx = static_cast<size_t>(type);
+    const auto new_val = _memory_usage_data[idx].fetch_add(bytes) + bytes;
+    // Make sure actual maximum value is stored
+    while (new_val > _peak_memory_usage_data[idx]) {
+        _peak_memory_usage_data[idx] = new_val;
     }
 }
 
 void engine::subtract_memory_used(uint64_t bytes, allocation_type type) {
-    std::lock_guard<std::mutex> guard(_mutex);
-    auto iter = _memory_usage_map.find(type);
-    if (iter != _memory_usage_map.end()) {
-        _memory_usage_map[type] -= bytes;
-    } else {
+    auto idx = static_cast<size_t>(type);
+    if (_memory_usage_data[idx].load() < bytes) {
         throw std::runtime_error("Attempt to free unallocated memory");
     }
+    _memory_usage_data[idx] -= bytes;
 }
 
 std::shared_ptr<cldnn::engine> engine::create(engine_types engine_type, runtime_types runtime_type, const device::ptr device) {
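The peak update in `add_memory_used` above pairs `fetch_add` with a check-then-store loop. For comparison, a compact lock-free way to record a running maximum is a `compare_exchange_weak` loop; a minimal sketch (the helper `update_peak` is illustrative and not part of this commit):

```cpp
#include <atomic>
#include <cstdint>

// Raise `peak` to at least `new_val` without a mutex. On failure,
// compare_exchange_weak reloads the current value into `observed`,
// so the loop exits only once `peak` holds a value >= new_val.
void update_peak(std::atomic<uint64_t>& peak, uint64_t new_val) {
    uint64_t observed = peak.load();
    while (new_val > observed && !peak.compare_exchange_weak(observed, new_val)) {
    }
}
```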
