From e1a577d9f19abaab99003f674c0a50643a606e3d Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Thu, 15 Feb 2024 17:31:22 -0800 Subject: [PATCH 1/2] add memory metrics to TensorBoard [ghstack-poisoned] --- torchtrain/metrics.py | 6 +++--- train.py | 12 ++++++++++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/torchtrain/metrics.py b/torchtrain/metrics.py index 092cafae..cad98e85 100644 --- a/torchtrain/metrics.py +++ b/torchtrain/metrics.py @@ -122,15 +122,15 @@ def get_current_stats(self, return_data: bool = False): ) display_str = "" - display_str += f"Current Memory: {self.device_name} ({self.device_index}): Reserved: {self.device_reserved_memory_pct}%," - display_str += f"Alloc {self.device_alloc_memory_pct}%, Active: {self.device_active_memory_pct}%\n" + display_str += f"Current Memory: {self.device_name} ({self.device_index}): Reserved: {self.device_reserved_memory_pct}%, " + display_str += f"Alloc {self.device_alloc_memory_pct}%, Active: {self.device_active_memory_pct}%\n" self.get_peak_stats(curr_mem) peak_active_pct = self.get_pct_memory(self.peak_active_memory) peak_allocated_pct = self.get_pct_memory(self.peak_allocated_memory) peak_reserved_pct = self.get_pct_memory(self.peak_reserved_memory) - display_str += f"Peak Memory: Reserved {peak_reserved_pct}%, Alloc {peak_allocated_pct}%, Active: {peak_active_pct}%\n" + display_str += f"Peak Memory: Reserved {peak_reserved_pct}%, Alloc {peak_allocated_pct}%, Active: {peak_active_pct}%\n" display_str += f"num retries: {self.num_retries}, num ooms: {self.num_ooms}" if self.num_retries > 0: diff --git a/train.py b/train.py index e922acf3..e4d3c01b 100644 --- a/train.py +++ b/train.py @@ -219,10 +219,18 @@ def main(args): time_delta * parallel_dims.model_parallel_size ) + gpu_mem_stats = gpu_metrics.get_current_stats(return_data=True) + metrics = { - "global_avg_loss": global_avg_loss, - "global_max_loss": global_max_loss, + "loss/global_avg": global_avg_loss, + "loss/global_max": global_max_loss, "wps": wps, + "memory_current/active(%)": gpu_mem_stats.active_curr, + "memory_current/allocated(%)": gpu_mem_stats.allocated_curr, + "memory_current/reserved(%)": gpu_mem_stats.reserved_curr, + "memory_peak/active(%)": gpu_mem_stats.active_peak, + "memory_peak/allocated(%)": gpu_mem_stats.allocated_peak, + "memory_peak/reserved(%)": gpu_mem_stats.reserved_peak, } metric_logger.log(metrics, step=train_state.step) From b77c89f14efa6e742d3e38955878a68d4c243016 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Fri, 16 Feb 2024 17:38:04 -0800 Subject: [PATCH 2/2] Update on "add memory metrics to TensorBoard" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Screenshot 2024-02-15 at 5 19 09 PM [ghstack-poisoned] --- train.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/train.py b/train.py index e4d3c01b..faabc2d8 100644 --- a/train.py +++ b/train.py @@ -222,8 +222,8 @@ def main(args): gpu_mem_stats = gpu_metrics.get_current_stats(return_data=True) metrics = { - "loss/global_avg": global_avg_loss, - "loss/global_max": global_max_loss, + "loss_metrics/global_avg_loss": global_avg_loss, + "loss_metrics/global_max_loss": global_max_loss, "wps": wps, "memory_current/active(%)": gpu_mem_stats.active_curr, "memory_current/allocated(%)": gpu_mem_stats.allocated_curr,