From b8ebd4c7f9ba0ea4c6ce2ac7d9fe6ccf2afbbbc2 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Tue, 17 Oct 2023 09:53:44 -0700 Subject: [PATCH 1/2] Address review comments --- src/callbacks/gpu_memory_usage.cpp | 9 ++------- src/callbacks/memory_profiler.cpp | 19 ++++--------------- 2 files changed, 6 insertions(+), 22 deletions(-) diff --git a/src/callbacks/gpu_memory_usage.cpp b/src/callbacks/gpu_memory_usage.cpp index bef38a88782..26e37d55c6c 100644 --- a/src/callbacks/gpu_memory_usage.cpp +++ b/src/callbacks/gpu_memory_usage.cpp @@ -30,6 +30,7 @@ #include "lbann/models/model.hpp" #include "lbann/utils/gpu/helpers.hpp" #include "lbann/utils/serialize.hpp" +#include <h2/gpu/memory_utils.hpp>

#include <iomanip> #include <sstream> @@ -79,13 +80,7 @@ void gpu_memory_usage::write_specific_proto(lbann_data::Callback& proto) const void gpu_memory_usage::on_epoch_begin(model* m) { #ifdef LBANN_HAS_GPU - size_t available; - size_t total; -#ifdef LBANN_HAS_CUDA - FORCE_CHECK_CUDA(cudaMemGetInfo(&available, &total)); -#elif defined(LBANN_HAS_ROCM) - FORCE_CHECK_ROCM(hipMemGetInfo(&available, &total)); -#endif + auto const [available, total] = h2::gpu::mem_info(); size_t used = total - available; auto comm = m->get_comm(); if (comm->am_trainer_master()) { diff --git a/src/callbacks/memory_profiler.cpp b/src/callbacks/memory_profiler.cpp index 8d770a8ab8a..2b0dbd604ba 100644 --- a/src/callbacks/memory_profiler.cpp +++ b/src/callbacks/memory_profiler.cpp @@ -39,6 +39,7 @@ #include "h2/patterns/multimethods/SwitchDispatcher.hpp" #include <algorithm> +#include <h2/gpu/memory_utils.hpp>

#include <sstream> namespace lbann { @@ -169,17 +170,12 @@ size_t get_activation_and_error_signal_size(Layer const& x, std::ostream& os) /** * @brief Returns the currently used memory, or 0 if LBANN was not compiled with * GPU support. + * TODO(later): Gather across all ranks? */ size_t get_used_gpu_memory() { #ifdef LBANN_HAS_GPU - size_t available; - size_t total; -#ifdef LBANN_HAS_CUDA - FORCE_CHECK_CUDA(cudaMemGetInfo(&available, &total)); -#elif defined(LBANN_HAS_ROCM) - FORCE_CHECK_ROCM(hipMemGetInfo(&available, &total)); -#endif + auto const [available, total] = h2::gpu::mem_info(); // TODO(later): Might be nicer to return a struct with gathered information // (min, max, median across ranks) return total - available; @@ -195,14 +191,7 @@ size_t get_used_gpu_memory() static inline size_t get_total_gpu_memory() { #ifdef LBANN_HAS_GPU - size_t available; - size_t total; -#ifdef LBANN_HAS_CUDA - FORCE_CHECK_CUDA(cudaMemGetInfo(&available, &total)); -#elif defined(LBANN_HAS_ROCM) - FORCE_CHECK_ROCM(hipMemGetInfo(&available, &total)); -#endif - return total; + return h2::gpu::mem_info().total; #else return 0; #endif From 550901422886c144085cf8098091d640b0191b9c Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Tue, 17 Oct 2023 13:37:53 -0700 Subject: [PATCH 2/2] Conditional inclusion --- src/callbacks/gpu_memory_usage.cpp | 5 ++++- src/callbacks/memory_profiler.cpp | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/callbacks/gpu_memory_usage.cpp b/src/callbacks/gpu_memory_usage.cpp index 26e37d55c6c..4da095daac6 100644 --- a/src/callbacks/gpu_memory_usage.cpp +++ b/src/callbacks/gpu_memory_usage.cpp @@ -30,10 +30,13 @@ #include "lbann/models/model.hpp" #include "lbann/utils/gpu/helpers.hpp" #include "lbann/utils/serialize.hpp" -#include <h2/gpu/memory_utils.hpp>

#include <iomanip> #include <sstream> +#ifdef LBANN_HAS_GPU +#include <h2/gpu/memory_utils.hpp>

+#endif + #include "lbann/proto/callbacks.pb.h" namespace { diff --git a/src/callbacks/memory_profiler.cpp b/src/callbacks/memory_profiler.cpp index 2b0dbd604ba..441b1089898 100644 --- a/src/callbacks/memory_profiler.cpp +++ b/src/callbacks/memory_profiler.cpp @@ -39,9 +39,12 @@ #include "h2/patterns/multimethods/SwitchDispatcher.hpp" #include <algorithm> -#include <h2/gpu/memory_utils.hpp>

#include <sstream> +#ifdef LBANN_HAS_GPU +#include <h2/gpu/memory_utils.hpp>

+#endif + namespace lbann { namespace callback { namespace {