diff --git a/src/callbacks/gpu_memory_usage.cpp b/src/callbacks/gpu_memory_usage.cpp
index bef38a88782..4da095daac6 100644
--- a/src/callbacks/gpu_memory_usage.cpp
+++ b/src/callbacks/gpu_memory_usage.cpp
@@ -33,6 +33,10 @@
 #include
 #include
 
+#ifdef LBANN_HAS_GPU
+#include
+#endif
+
 #include "lbann/proto/callbacks.pb.h"
 
 namespace {
@@ -79,13 +83,7 @@ void gpu_memory_usage::write_specific_proto(lbann_data::Callback& proto) const
 void gpu_memory_usage::on_epoch_begin(model* m)
 {
 #ifdef LBANN_HAS_GPU
-  size_t available;
-  size_t total;
-#ifdef LBANN_HAS_CUDA
-  FORCE_CHECK_CUDA(cudaMemGetInfo(&available, &total));
-#elif defined(LBANN_HAS_ROCM)
-  FORCE_CHECK_ROCM(hipMemGetInfo(&available, &total));
-#endif
+  auto const [available, total] = h2::gpu::mem_info();
   size_t used = total - available;
   auto comm = m->get_comm();
   if (comm->am_trainer_master()) {
diff --git a/src/callbacks/memory_profiler.cpp b/src/callbacks/memory_profiler.cpp
index 8d770a8ab8a..441b1089898 100644
--- a/src/callbacks/memory_profiler.cpp
+++ b/src/callbacks/memory_profiler.cpp
@@ -41,6 +41,10 @@
 #include
 #include
 
+#ifdef LBANN_HAS_GPU
+#include
+#endif
+
 namespace lbann {
 namespace callback {
 namespace {
@@ -169,17 +173,12 @@ size_t get_activation_and_error_signal_size(Layer const& x, std::ostream& os)
 /**
  * @brief Returns the currently used memory, or 0 if LBANN was not compiled with
  * GPU support.
+ * TODO(later): Gather across all ranks?
  */
 size_t get_used_gpu_memory()
 {
 #ifdef LBANN_HAS_GPU
-  size_t available;
-  size_t total;
-#ifdef LBANN_HAS_CUDA
-  FORCE_CHECK_CUDA(cudaMemGetInfo(&available, &total));
-#elif defined(LBANN_HAS_ROCM)
-  FORCE_CHECK_ROCM(hipMemGetInfo(&available, &total));
-#endif
+  auto const [available, total] = h2::gpu::mem_info();
   // TODO(later): Might be nicer to return a struct with gathered information
   // (min, max, median across ranks)
   return total - available;
@@ -195,14 +194,7 @@ size_t get_used_gpu_memory()
 static inline size_t get_total_gpu_memory()
 {
 #ifdef LBANN_HAS_GPU
-  size_t available;
-  size_t total;
-#ifdef LBANN_HAS_CUDA
-  FORCE_CHECK_CUDA(cudaMemGetInfo(&available, &total));
-#elif defined(LBANN_HAS_ROCM)
-  FORCE_CHECK_ROCM(hipMemGetInfo(&available, &total));
-#endif
-  return total;
+  return h2::gpu::mem_info().total;
 #else
   return 0;
 #endif
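
Note on the bare #include lines above: the angle-bracketed targets were lost when this patch was extracted, so they are left blank rather than guessed at. The added include in each file presumably pulls in the DiHydrogen (h2) header that declares h2::gpu::mem_info(), which now hides the CUDA/ROCm branching behind a single helper so the callbacks only need the one LBANN_HAS_GPU guard. The following standalone sketch shows the pattern both callbacks converge on; the header path <h2/gpu/memory_utils.hpp>, the function name used_gpu_memory_bytes(), and the assumption that mem_info() returns a two-field struct (free and total bytes, consumable with structured bindings) are inferred from the diff, not confirmed by it.

// Sketch only, not part of the patch. Assumes the DiHydrogen header path
// <h2/gpu/memory_utils.hpp> and that h2::gpu::mem_info() returns a small
// struct holding free and total device memory in bytes, as the structured
// binding in the diff implies.
#include <cstddef>
#include <iostream>

#ifdef LBANN_HAS_GPU
#include <h2/gpu/memory_utils.hpp> // assumed header for h2::gpu::mem_info()
#endif

// Mirrors get_used_gpu_memory() from the patch: used = total - available,
// or 0 when built without GPU support.
std::size_t used_gpu_memory_bytes()
{
#ifdef LBANN_HAS_GPU
  auto const [available, total] = h2::gpu::mem_info();
  return total - available;
#else
  return 0;
#endif
}

int main()
{
  std::cout << "Used GPU memory: " << used_gpu_memory_bytes() << " bytes\n";
  return 0;
}

The same helper also replaces the total-memory query: get_total_gpu_memory() now just reads h2::gpu::mem_info().total, so both callbacks drop their per-backend FORCE_CHECK_CUDA/FORCE_CHECK_ROCM calls entirely.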