From b8ebd4c7f9ba0ea4c6ce2ac7d9fe6ccf2afbbbc2 Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Tue, 17 Oct 2023 09:53:44 -0700 Subject: [PATCH 1/2] Address review comments --- src/callbacks/gpu_memory_usage.cpp | 9 ++------- src/callbacks/memory_profiler.cpp | 19 ++++--------------- 2 files changed, 6 insertions(+), 22 deletions(-) diff --git a/src/callbacks/gpu_memory_usage.cpp b/src/callbacks/gpu_memory_usage.cpp index bef38a88782..26e37d55c6c 100644 --- a/src/callbacks/gpu_memory_usage.cpp +++ b/src/callbacks/gpu_memory_usage.cpp @@ -30,6 +30,7 @@ #include "lbann/models/model.hpp" #include "lbann/utils/gpu/helpers.hpp" #include "lbann/utils/serialize.hpp" +#include <h2/gpu/memory_utils.hpp>

#include <iomanip> #include <sstream> @@ -79,13 +80,7 @@ void gpu_memory_usage::write_specific_proto(lbann_data::Callback& proto) const void gpu_memory_usage::on_epoch_begin(model* m) { #ifdef LBANN_HAS_GPU - size_t available; - size_t total; -#ifdef LBANN_HAS_CUDA - FORCE_CHECK_CUDA(cudaMemGetInfo(&available, &total)); -#elif defined(LBANN_HAS_ROCM) - FORCE_CHECK_ROCM(hipMemGetInfo(&available, &total)); -#endif + auto const [available, total] = h2::gpu::mem_info(); size_t used = total - available; auto comm = m->get_comm(); if (comm->am_trainer_master()) { diff --git a/src/callbacks/memory_profiler.cpp b/src/callbacks/memory_profiler.cpp index 8d770a8ab8a..2b0dbd604ba 100644 --- a/src/callbacks/memory_profiler.cpp +++ b/src/callbacks/memory_profiler.cpp @@ -39,6 +39,7 @@ #include "h2/patterns/multimethods/SwitchDispatcher.hpp" #include <algorithm> +#include <h2/gpu/memory_utils.hpp>

#include <sstream> namespace lbann { @@ -169,17 +170,12 @@ size_t get_activation_and_error_signal_size(Layer const& x, std::ostream& os) /** * @brief Returns the currently used memory, or 0 if LBANN was not compiled with * GPU support. + * TODO(later): Gather across all ranks? */ size_t get_used_gpu_memory() { #ifdef LBANN_HAS_GPU - size_t available; - size_t total; -#ifdef LBANN_HAS_CUDA - FORCE_CHECK_CUDA(cudaMemGetInfo(&available, &total)); -#elif defined(LBANN_HAS_ROCM) - FORCE_CHECK_ROCM(hipMemGetInfo(&available, &total)); -#endif + auto const [available, total] = h2::gpu::mem_info(); // TODO(later): Might be nicer to return a struct with gathered information // (min, max, median across ranks) return total - available; @@ -195,14 +191,7 @@ size_t get_used_gpu_memory() static inline size_t get_total_gpu_memory() { #ifdef LBANN_HAS_GPU - size_t available; - size_t total; -#ifdef LBANN_HAS_CUDA - FORCE_CHECK_CUDA(cudaMemGetInfo(&available, &total)); -#elif defined(LBANN_HAS_ROCM) - FORCE_CHECK_ROCM(hipMemGetInfo(&available, &total)); -#endif - return total; + return h2::gpu::mem_info().total; #else return 0; #endif From 550901422886c144085cf8098091d640b0191b9c Mon Sep 17 00:00:00 2001 From: Tal Ben-Nun Date: Tue, 17 Oct 2023 13:37:53 -0700 Subject: [PATCH 2/2] Conditional inclusion --- src/callbacks/gpu_memory_usage.cpp | 5 ++++- src/callbacks/memory_profiler.cpp | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/src/callbacks/gpu_memory_usage.cpp b/src/callbacks/gpu_memory_usage.cpp index 26e37d55c6c..4da095daac6 100644 --- a/src/callbacks/gpu_memory_usage.cpp +++ b/src/callbacks/gpu_memory_usage.cpp @@ -30,10 +30,13 @@ #include "lbann/models/model.hpp" #include "lbann/utils/gpu/helpers.hpp" #include "lbann/utils/serialize.hpp" -#include <h2/gpu/memory_utils.hpp>

#include <iomanip> #include <sstream> +#ifdef LBANN_HAS_GPU +#include <h2/gpu/memory_utils.hpp>

+#endif + #include "lbann/proto/callbacks.pb.h" namespace { diff --git a/src/callbacks/memory_profiler.cpp b/src/callbacks/memory_profiler.cpp index 2b0dbd604ba..441b1089898 100644 --- a/src/callbacks/memory_profiler.cpp +++ b/src/callbacks/memory_profiler.cpp @@ -39,9 +39,12 @@ #include "h2/patterns/multimethods/SwitchDispatcher.hpp" #include <algorithm> -#include <h2/gpu/memory_utils.hpp>

#include <sstream> +#ifdef LBANN_HAS_GPU +#include <h2/gpu/memory_utils.hpp>

+#endif + namespace lbann { namespace callback { namespace {