From 23698614e4370d7f3fb6cd6ecf41b7c6a4e8a2ac Mon Sep 17 00:00:00 2001
From: Tianyu Liu
Date: Tue, 12 Mar 2024 17:58:51 -0700
Subject: [PATCH] improve logging

ghstack-source-id: de61ec093b43a2ccbf1156c76ba81ecd698a6a8a
Pull Request resolved: https://github.com/pytorch/torchtrain/pull/132
---
 torchtrain/checkpoint.py                     | 16 ++--
 torchtrain/datasets/hf_datasets.py           | 27 ++----
 torchtrain/datasets/tokenizer.py             | 22 +++--
 torchtrain/float8_linear.py                  |  5 +-
 torchtrain/logging_utils.py                  |  9 +-
 torchtrain/metrics.py                        | 17 ++--
 torchtrain/models/llama/model.py             |  5 --
 torchtrain/parallelisms/__init__.py          |  8 +-
 torchtrain/parallelisms/parallelize_llama.py | 12 ++-
 torchtrain/profiling.py                      |  6 +-
 train.py                                     | 86 ++++++++++----------
 train_configs/debug_model.toml               |  4 +-
 train_configs/llama_13b.toml                 |  2 +-
 train_configs/llama_70b.toml                 |  2 +-
 train_configs/llama_7b.toml                  |  2 +-
 15 files changed, 99 insertions(+), 124 deletions(-)

diff --git a/torchtrain/checkpoint.py b/torchtrain/checkpoint.py
index bd141142..b660c240 100644
--- a/torchtrain/checkpoint.py
+++ b/torchtrain/checkpoint.py
@@ -17,7 +17,7 @@
     set_model_state_dict,
     set_optimizer_state_dict,
 )
-from torchtrain.logging_utils import rank0_log
+from torchtrain.logging_utils import logger
 
 
 class IntervalType(enum.Enum):
@@ -109,13 +109,13 @@ def save(self, curr_step: int, force: bool = False) -> None:
             self.work = None
             self.doit = None
 
-        rank0_log(f"Saving a checkpoint in step {curr_step}.")
+        logger.info(f"Saving a checkpoint at step {curr_step}")
         begin = time.monotonic()
         dcp.save(self.states, checkpoint_id=self.create_checkpoint_id(curr_step))
         self.reset()
-        rank0_log(
-            f"Finish saving the checkpoint in step {curr_step}. "
-            f"{time.monotonic() - begin} seconds"
+        logger.info(
+            f"Finished saving the checkpoint at step {curr_step} "
+            f"in {time.monotonic() - begin} seconds"
         )
 
     def load(self, step: int = -1) -> bool:
@@ -136,11 +136,13 @@ def load(self, step: int = -1) -> bool:
                 return False
             step = max(step_counts)
 
-        rank0_log("Loading a checkpoint.")
+        logger.info("Loading a checkpoint")
         begin = time.monotonic()
         dcp.load(
             self.states,
             checkpoint_id=self.create_checkpoint_id(step),
         )
-        rank0_log(f"Finish loading a checkpoint. {time.monotonic() - begin} seconds.")
+        logger.info(
+            f"Finished loading the checkpoint in {time.monotonic() - begin} seconds"
+        )
         return True
diff --git a/torchtrain/datasets/hf_datasets.py b/torchtrain/datasets/hf_datasets.py
index d8ecec09..63e0db04 100644
--- a/torchtrain/datasets/hf_datasets.py
+++ b/torchtrain/datasets/hf_datasets.py
@@ -7,8 +7,7 @@
 from torch.utils.data import DataLoader, IterableDataset
 
 from torchtrain.datasets.tokenizer import TokenizerIf
-from torchtrain.logging_utils import rank0_log
-from torchtrain.utils import Color
+from torchtrain.logging_utils import logger
 
 from datasets import load_dataset, load_from_disk
 from datasets.distributed import split_dataset_by_node
@@ -97,21 +96,17 @@ def __init__(
     ) -> None:
         if dataset_name not in _supported_datasets:
             raise ValueError(
-                f"Dataset {dataset_name} is not supported. Supported datasets are: {_supported_datasets.keys()}"
+                f"Dataset {dataset_name} is not supported. Supported datasets are: {_supported_datasets.keys()}."
             )
 
         # TODO: This is a temporary solution for small datasets like Alpaca.
         # For larger datasets we need to use a more scalable approach.
         if dataset_path:
-            rank0_log(
-                f"{Color.green}Loading '{dataset_name}' dataset locally from {dataset_path}...{Color.reset}"
-            )
+            logger.info(f"Loading {dataset_name} dataset locally from {dataset_path}")
             ds = load_from_disk(dataset_path)
         else:
-            rank0_log(
-                f"{Color.green}Preparing '{dataset_name}' dataset from HuggingFace...{Color.reset}"
-            )
-            # Setting `streaming=True` works for large dataset, but the speed is slow.
+            logger.info(f"Preparing {dataset_name} dataset from HuggingFace")
+            # Setting `streaming=True` works for large dataset, but is slightly slower and unstable.
             # c4 is huge, and requires both streaming and language selection (we default to en)
             if dataset_name == "c4":
                 ds = load_dataset(
@@ -147,16 +142,12 @@ def __iter__(self):
                     label = x[1:]
                     yield input, label
             if not self.infinite:
-                rank0_log(
-                    f"{Color.red}WARNING:{Color.reset} dataset {Color.yellow}'{self.dataset_name}'{Color.reset} has "
-                    f"run out of data.{Color.reset}"
-                )
+                logger.warning(f"Dataset {self.dataset_name} has run out of data.")
                 break
             else:
-                # we are re-looping on the same dataset, warn user
-                rank0_log(
-                    f"{Color.red}WARNING:{Color.reset} dataset {Color.yellow}'{self.dataset_name}'{Color.reset} is "
-                    f"being re-looped. Loss related metrics might be misleading.{Color.reset}"
+                logger.warning(
+                    f"Dataset {self.dataset_name} is being re-looped. "
+                    "Loss related metrics might be misleading."
                 )
 
 
diff --git a/torchtrain/datasets/tokenizer.py b/torchtrain/datasets/tokenizer.py
index 07593b55..2f2f6fee 100644
--- a/torchtrain/datasets/tokenizer.py
+++ b/torchtrain/datasets/tokenizer.py
@@ -1,5 +1,5 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
+# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -8,13 +8,11 @@
 import os
 from abc import ABC, abstractmethod
-from logging import getLogger
 from typing import List
 
 from sentencepiece import SentencePieceProcessor
 
-
-logger = getLogger()
+from torchtrain.logging_utils import logger
 
 
 class TokenizerIf(ABC):
@@ -40,6 +38,7 @@ def n_words(self) -> int:
 
 
 def create_tokenizer(tokenizer_type: str, tokenizer_path: str) -> TokenizerIf:
+    logger.info(f"Building {tokenizer_type} tokenizer locally from {tokenizer_path}")
     if tokenizer_type == "sentencepiece":
         return SentencePieceTokenizer(tokenizer_path)
     else:
@@ -47,19 +46,18 @@ def create_tokenizer(tokenizer_type: str, tokenizer_path: str) -> TokenizerIf:
 
 
 class SentencePieceTokenizer(TokenizerIf):
-    """tokenizing and encoding/decoding text using SentencePiece."""
+    """
+    Tokenizing and encoding/decoding text based on a SentencePiece model.
+
+    Args:
+        tokenizer_path (str): The path to the SentencePiece model file.
+    """
 
     def __init__(self, tokenizer_path: str):
-        """
-        Initializes the Tokenizer with a SentencePiece model.
-
-        Args:
-            tokenizer_path (str): The path to the SentencePiece model file.
-        """
         super().__init__(tokenizer_path)
         # reload tokenizer
         self.sp_model = SentencePieceProcessor(model_file=tokenizer_path)
-        logger.info(f"Reloaded SentencePiece model from {tokenizer_path}")
 
         # BOS / EOS token IDs
         self._n_words: int = self.sp_model.vocab_size()
@@ -67,7 +65,7 @@ def __init__(self, tokenizer_path: str):
         self.eos_id: int = self.sp_model.eos_id()
         self.pad_id: int = self.sp_model.pad_id()
         logger.info(
-            f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
+            f"SentencePieceTokenizer built: #words {self.n_words}, BOS ID {self.bos_id}, EOS ID {self.eos_id}"
         )
 
         assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()
diff --git a/torchtrain/float8_linear.py b/torchtrain/float8_linear.py
index 10d60d9a..169129a6 100644
--- a/torchtrain/float8_linear.py
+++ b/torchtrain/float8_linear.py
@@ -5,9 +5,8 @@
 # All rights reserved
 
 from torchtrain.config_manager import JobConfig
-from torchtrain.logging_utils import rank0_log
+from torchtrain.logging_utils import logger
 from torchtrain.models.llama import Transformer
-from torchtrain.utils import Color
 
 
 def build_fp8_linear(model: Transformer, job_config: JobConfig):
@@ -42,4 +41,4 @@ def build_fp8_linear(model: Transformer, job_config: JobConfig):
 
     # Mutates the model inplace replacing instances of torch.nn.Linear with float8_linear_type
     swap_linear_with_float8_linear(model, float8_linear_type)
-    rank0_log(f"{Color.green}Using {linear_type} float8 linear layers{Color.reset}")
+    logger.info(f"Swapped to {linear_type} float8 linear layers")
diff --git a/torchtrain/logging_utils.py b/torchtrain/logging_utils.py
index f2ed30b8..dfd758a0 100644
--- a/torchtrain/logging_utils.py
+++ b/torchtrain/logging_utils.py
@@ -1,15 +1,12 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
+
 import logging
-import torch
 
 logger = logging.getLogger()
 
 
-def rank0_log(msg):
-    if torch.distributed.get_rank() == 0:
-        logger.info(msg)
-
-
 def init_logger():
     logger.setLevel(logging.INFO)
     ch = logging.StreamHandler()
diff --git a/torchtrain/metrics.py b/torchtrain/metrics.py
index 91a1e184..2f495d90 100644
--- a/torchtrain/metrics.py
+++ b/torchtrain/metrics.py
@@ -13,8 +13,7 @@
 import torch.nn as nn
 from torch.utils.tensorboard import SummaryWriter
 from torchtrain.config_manager import JobConfig
-
-from torchtrain.logging_utils import rank0_log
+from torchtrain.logging_utils import logger
 
 # note that GiB (gibibyte) is 1024, vs GB is 1000
 _gib_in_bytes = 1024 * 1024 * 1024
@@ -122,18 +121,18 @@ def get_current_stats(self, return_data: bool = False):
             self.device_active_memory_usage, self.device_capacity, precision=2
         )
 
-        display_str = ""
-        display_str += f"Current Memory: {self.device_name} ({self.device_index}): Reserved: {self.device_reserved_memory_pct}%, "
-        display_str += f"Alloc {self.device_alloc_memory_pct}%, Active: {self.device_active_memory_pct}%\n"
+        display_str = f"{self.device_name} ({self.device_index}). "
+        display_str += f"Current memory: reserved {self.device_reserved_memory_pct}%, "
+        display_str += f"alloc {self.device_alloc_memory_pct}%, active {self.device_active_memory_pct}%. "
 
         self.get_peak_stats(curr_mem)
 
         peak_active_pct = self.get_pct_memory(self.peak_active_memory)
         peak_allocated_pct = self.get_pct_memory(self.peak_allocated_memory)
         peak_reserved_pct = self.get_pct_memory(self.peak_reserved_memory)
-        display_str += f"Peak Memory: Reserved {peak_reserved_pct}%, Alloc {peak_allocated_pct}%, Active: {peak_active_pct}%\n"
+        display_str += f"Peak memory: reserved {peak_reserved_pct}%, alloc {peak_allocated_pct}%, active {peak_active_pct}%. "
 
-        display_str += f"num retries: {self.num_retries}, num ooms: {self.num_ooms}"
+        display_str += f"Num retries: {self.num_retries}. Num ooms: {self.num_ooms}."
 
         if self.num_retries > 0:
             display_str += f"\nWARNING: {self.num_retries} retries -- recommend lowering batch size for max performance\n"
@@ -224,8 +223,8 @@ def build_metric_logger(config: JobConfig, tag: Optional[str] = None):
 
     enable_tb = config.metrics.enable_tensorboard
     if enable_tb:
-        rank0_log(
-            f"Metrics logging active. Tensorboard logs will be saved at {log_dir}."
+        logger.info(
+            f"Metrics logging active. Tensorboard logs will be saved at {log_dir}"
         )
 
     rank_str = f"rank_{torch.distributed.get_rank()}"
diff --git a/torchtrain/models/llama/model.py b/torchtrain/models/llama/model.py
index 1ba505cf..da6a4e14 100644
--- a/torchtrain/models/llama/model.py
+++ b/torchtrain/models/llama/model.py
@@ -8,8 +8,6 @@
 import torch.nn.functional as F
 from torch import nn
 
-from torchtrain.logging_utils import rank0_log
-
 
 @dataclass
 class ModelArgs:
@@ -476,8 +474,6 @@ def __init__(self, model_args: ModelArgs):
 
         # self.reset_parameters()
 
-        rank0_log(f"Model built with: {self.model_args}")
-
     def reset_parameters(
         self,
     ):
@@ -493,7 +489,6 @@ def reset_parameters(
             a=-cutoff_factor * final_out_std,
             b=cutoff_factor * final_out_std,
         )
-        rank0_log("Model fully initialized via reset_params")
 
     def forward(self, tokens: torch.Tensor):
         """
diff --git a/torchtrain/parallelisms/__init__.py b/torchtrain/parallelisms/__init__.py
index fdcd938d..1226a9c7 100644
--- a/torchtrain/parallelisms/__init__.py
+++ b/torchtrain/parallelisms/__init__.py
@@ -1,17 +1,13 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
 
-import logging
 from dataclasses import dataclass
 from functools import cached_property
 
 from torch.distributed.device_mesh import init_device_mesh
-
+from torchtrain.logging_utils import logger
 from torchtrain.parallelisms.parallelize_llama import parallelize_llama
 
-logger = logging.getLogger(__name__)
-
-
 models_parallelize_fns = {
     "llama": parallelize_llama,
 }
@@ -48,8 +44,8 @@ def build_mesh(self, device_type):
             if d > 1:
                 dims.append(d)
                 names.append(name)
-        names = tuple(names)
         logger.info(f"Building {len(dims)}-D device mesh with {names}, {dims}")
+        names = tuple(names)
         return init_device_mesh(device_type, dims, mesh_dim_names=names)
 
     @property
diff --git a/torchtrain/parallelisms/parallelize_llama.py b/torchtrain/parallelisms/parallelize_llama.py
index 34252b6b..6c72b046 100644
--- a/torchtrain/parallelisms/parallelize_llama.py
+++ b/torchtrain/parallelisms/parallelize_llama.py
@@ -4,7 +4,6 @@
 # this file applies the PTD parallelisms and various training techniques to the
 # llama model, i.e. activation checkpoint, etc.
 
-import logging
 from collections import defaultdict
 
 import torch
@@ -15,7 +14,6 @@
     Replicate,
     Shard,
 )
-
 from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
     checkpoint_wrapper as ptd_checkpoint_wrapper,
     CheckpointImpl,
@@ -34,11 +32,9 @@
     RowwiseParallel,
 )
 from torchtrain.config_manager import JobConfig
-from torchtrain.logging_utils import rank0_log
+from torchtrain.logging_utils import logger
 from torchtrain.meta_init import meta_to_real_init_fn
 
-logger = logging.getLogger(__name__)
-
 
 def distribute_rmsnorm(module, device_mesh):
     # temp sharding API until PTD API is added
@@ -195,7 +191,7 @@ def parallelize_llama(model, world_mesh, parallel_dims, job_config: JobConfig):
             parallelize_plan=layer_plan,
         )
 
-    rank0_log("Applied Sequence Parallelism to the model...")
+    logger.info("Applied Sequence Parallelism to the model")
 
     if parallel_dims.dp_enabled:
         dp_mesh = world_mesh["dp"]
@@ -228,7 +224,7 @@ def parallelize_llama(model, world_mesh, parallel_dims, job_config: JobConfig):
             # wrap the rest layers with FSDP
             model = wrap(model)
 
-        rank0_log("Applied FSDP to the model...")
+        logger.info("Applied FSDP to the model")
     else:
         meta_to_real_init_fn(model)
         model.cuda()
@@ -236,4 +232,6 @@ def parallelize_llama(model, world_mesh, parallel_dims, job_config: JobConfig):
         # we have now moved from meta to device,
         # reset parameters for proper initialization
         model.reset_parameters()
+        logger.info("Model fully initialized via reset_parameters")
+
     return model
diff --git a/torchtrain/profiling.py b/torchtrain/profiling.py
index ed1a9b5a..847d967a 100644
--- a/torchtrain/profiling.py
+++ b/torchtrain/profiling.py
@@ -6,7 +6,7 @@
 
 import torch
 from torchtrain.config_manager import JobConfig
-from torchtrain.logging_utils import rank0_log
+from torchtrain.logging_utils import logger
 
 
 @contextlib.contextmanager
@@ -31,11 +31,11 @@ def trace_handler(prof):
             curr_trace_dir = os.path.join(trace_dir, curr_trace_dir_name)
             if not os.path.exists(curr_trace_dir):
                 os.makedirs(curr_trace_dir, exist_ok=True)
-            rank0_log(f"exporting profile traces to {curr_trace_dir}")
+            logger.info(f"Exporting profile traces to {curr_trace_dir}")
             prof.export_chrome_trace(f"{curr_trace_dir}/rank{rank}_trace.json")
 
-        rank0_log(f"Profiling active. Traces will be saved at {trace_dir}")
+        logger.info(f"Profiling active. Traces will be saved at {trace_dir}")
 
         if not os.path.exists(trace_dir):
             os.makedirs(trace_dir, exist_ok=True)
diff --git a/train.py b/train.py
index 603364d0..69cea633 100644
--- a/train.py
+++ b/train.py
@@ -10,7 +10,6 @@
 
 import numpy as np
 
-# torch imports
 import torch
 import torch.nn.functional as F
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
@@ -19,18 +18,14 @@
 
 from torchtrain.checkpoint import CheckpointManager, IntervalType
 from torchtrain.config_manager import JobConfig
-
-# torchtrain related
 from torchtrain.datasets import create_tokenizer, dataloader_fn
 from torchtrain.float8_linear import build_fp8_linear
-from torchtrain.logging_utils import init_logger, rank0_log
+from torchtrain.logging_utils import init_logger, logger
 from torchtrain.lr_scheduling import get_lr_scheduler
 from torchtrain.meta_init import meta_model_init
 from torchtrain.metrics import build_metric_logger, get_num_params, GPUMemoryMonitor
-
 from torchtrain.models import model_name_to_cls, model_name_to_tokenizer, models_config
 from torchtrain.parallelisms import models_parallelize_fns, ParallelDims
-
 from torchtrain.profiling import maybe_run_profiler
 from torchtrain.utils import Color, dist_max, dist_mean
 
@@ -74,7 +69,7 @@ def build_optimizer(model, job_config: JobConfig):
             model.parameters(), lr=lr, betas=(0.9, 0.95), weight_decay=0.1
         )
     else:
-        raise NotImplementedError(f"optimizer {name} not added")
+        raise NotImplementedError(f"Optimizer {name} not added.")
 
     return optimizer
 
@@ -83,16 +78,18 @@ def build_grad_scaler(model):
     # apply gradient scaling if mixed precision training is enabled with fp16 param dtype
     if model.mixed_precision.param_dtype == torch.float16:
         enable_grad_scaling = True
-        rank0_log("Enabling gradient scaling for mixed precision training.")
+        logger.info("Enabling gradient scaling for mixed precision training")
     else:
         enable_grad_scaling = False
-        rank0_log("Gradient scaling not enabled.")
+        logger.info("Gradient scaling not enabled")
 
     return ShardedGradScaler(enabled=enable_grad_scaling)
 
 
 def main(job_config: JobConfig):
     init_logger()
+    logger.info(f"Starting job: {job_config.job.description}")
+
     # init world mesh
     world_size = int(os.environ["WORLD_SIZE"])
     parallel_dims = ParallelDims(
@@ -103,9 +100,9 @@ def main(job_config: JobConfig):
         enable_loss_parallel=job_config.training.enable_loss_parallel,
     )
     world_mesh = parallel_dims.build_mesh(device_type="cuda")
-    rank0_log(f"Starting job: {job_config.job.description}")
+
     model_name = job_config.model.name
-    rank0_log(f"Building {model_name}")
+
     # build tokenizer
     tokenizer_type = model_name_to_tokenizer[model_name]
     tokenizer = create_tokenizer(tokenizer_type, job_config.model.tokenizer_path)
@@ -127,17 +124,15 @@ def main(job_config: JobConfig):
         dp_degree,
         dp_rank,
     )
-    rank0_log(
-        f"{Color.green}Built Dataloader for '{job_config.training.dataset}' dataset.{Color.reset}"
-    )
 
-    # build model
+    # build model (using meta init)
     model_cls = model_name_to_cls[model_name]
     model_config = models_config[model_name][job_config.model.flavor]
     model_config.vocab_size = tokenizer.n_words
-
-    # build model using meta init
     with meta_model_init():
+        logger.info(
+            f"Building {model_name} {job_config.model.flavor} with {model_config}"
+        )
         model = model_cls.from_model_args(model_config)
 
     # apply fp8 linear module swap
@@ -146,21 +141,21 @@ def main(job_config: JobConfig):
 
     # log model size
     model_param_count = get_num_params(model)
     if _is_local_logging:
-        rank0_log(
-            f"{Color.blue}Model {model_name} {job_config.model.flavor} {Color.red}size: {model_param_count:,}"
-            f" total parameters{Color.reset}"
+        logger.info(
+            f"{Color.blue}Model {model_name} {job_config.model.flavor} "
+            f"{Color.red}size: {model_param_count:,} total parameters{Color.reset}"
         )
     else:
-        rank0_log(
+        logger.info(
             f"{model_name} {job_config.model.flavor} size: {model_param_count:,} total parameters"
         )
+
     # initialize GPU memory monitor before applying parallelisms to the model
     gpu_metrics = GPUMemoryMonitor("cuda")
-    rank0_log(f"GPU memory usage: {gpu_metrics}")
+    logger.info(f"GPU memory initial condition: {gpu_metrics}")
 
-    # apply PTD parallelisms + AC
+    # apply PTD parallelisms + AC/selective AC
     model = models_parallelize_fns[model_name](
         model, world_mesh, parallel_dims, job_config
     )
@@ -168,7 +163,7 @@ def main(job_config: JobConfig):
     # to use FSDP-customized gradient scaler and gradient clipping solutions
     assert isinstance(model, FSDP)
 
-    # build optimizer after apply parallelisms to the model
+    # build optimizer after applying parallelisms to the model
     optimizer = build_optimizer(model, job_config)
     scheduler = get_lr_scheduler(optimizer, job_config)
 
@@ -182,7 +177,7 @@ def main(job_config: JobConfig):
         torch._dynamo.config._experimental_support_context_fn_in_torch_utils_checkpoint = (
             True
         )
-        rank0_log(f"Compiling model {model_name} with torch.compile...")
+        logger.info("Compiling model with torch.compile")
         model = torch.compile(
             model,
         )
@@ -210,10 +205,12 @@ def main(job_config: JobConfig):
 
     with maybe_run_profiler(job_config) as torch_profiler:
         checkpoint.reset()
+
         # variables used to keep info for metrics logging
         losses_since_last_log: List[float] = []
         nwords_since_last_log = 0
         time_last_log = timer()
+
         while train_state.step < job_config.training.steps:
             train_state.step += 1
             # get batch
@@ -306,20 +303,20 @@ def main(job_config: JobConfig):
                 nwords_since_last_log = 0
                 time_last_log = timer()
 
-            if _is_local_logging:
-                rank0_log(
-                    f"{Color.cyan}step: {train_state.step:>2} {Color.green}loss: {round(train_state.current_loss,4):>7}"
-                    f" {Color.reset}iter: {Color.blue}{curr_iter_time:>7}{Color.reset}"
-                    f" data: {Color.blue}{data_load_time:>5} {Color.reset}"
-                    f"lr: {Color.yellow}{round(float(scheduler.get_last_lr()[0]), 8):<6}{Color.reset}"
-                )
-            else:
-                rank0_log(
-                    f"step: {train_state.step:>2} loss: {round(train_state.current_loss,4):>7}"
-                    f" iter: {curr_iter_time:>7}"
-                    f" data: {data_load_time:>5} "
-                    f"lr: {round(float(scheduler.get_last_lr()[0]), 8):<6}"
-                )
+                if _is_local_logging:
+                    logger.info(
+                        f"{Color.cyan}step: {train_state.step:2} "
+                        f"{Color.green}loss: {global_avg_loss.item():7.4f} "
+                        f"{Color.blue}wps: {round(wps):7,} "
+                        f"{Color.yellow}peak_memory: {gpu_mem_stats.reserved_peak:5}%{Color.reset}"
+                    )
+                else:
+                    logger.info(
+                        f"step: {train_state.step:2} "
+                        f"loss: {global_avg_loss.item():7.4f} "
+                        f"wps: {round(wps):7,} "
+                        f"peak_memory: {gpu_mem_stats.reserved_peak:5}%"
+                    )
 
             scheduler.step()
@@ -331,11 +328,14 @@ def main(job_config: JobConfig):
     # calc and show average iter time, disregard first three iterations (warmup)
     if len(train_state.iter_times) > 3:
         avg_iter_time = np.mean(train_state.iter_times[3:])
-        rank0_log(f"Average iter time: {avg_iter_time:.4f} seconds")
         avg_data_load_time = np.mean(train_state.data_load_times[3:])
-        rank0_log(f"Average data load time: {avg_data_load_time:.4f} seconds")
+        logger.info(
+            "Average time per iteration: "
+            f"training {avg_iter_time:.4f} seconds, "
+            f"data loading {avg_data_load_time:.4f} seconds"
+        )
 
-    rank0_log(f"{gpu_metrics.get_current_stats()}")
+    logger.info(f"GPU memory usage: {gpu_metrics.get_current_stats()}")
 
 
 if __name__ == "__main__":
diff --git a/train_configs/debug_model.toml b/train_configs/debug_model.toml
index 6efc5b9d..6289f010 100644
--- a/train_configs/debug_model.toml
+++ b/train_configs/debug_model.toml
@@ -1,7 +1,7 @@
 # TorchTrain Config.toml
 [job]
 dump_folder = "./outputs"
-description = "debug training"
+description = "LLaMA debug training"
 
 [profiling]
 run_profiler = true
@@ -12,7 +12,7 @@ profile_every_x_iter = 10
 [metrics]
 enable_tensorboard = true
 save_tb_folder = "tb"
-log_freq = 10
+log_freq = 1
 
 [model]
 name = "llama"
diff --git a/train_configs/llama_13b.toml b/train_configs/llama_13b.toml
index c095db19..0b31d9de 100644
--- a/train_configs/llama_13b.toml
+++ b/train_configs/llama_13b.toml
@@ -1,7 +1,7 @@
 # TorchTrain Config.toml
 [job]
 dump_folder = "./outputs"
-description = "llama 13b training"
+description = "LLaMA 13B training"
 
 [profiling]
 run_profiler = true
diff --git a/train_configs/llama_70b.toml b/train_configs/llama_70b.toml
index d56c4c52..b017f375 100644
--- a/train_configs/llama_70b.toml
+++ b/train_configs/llama_70b.toml
@@ -1,7 +1,7 @@
 # TorchTrain Config.toml
 [job]
 dump_folder = "./outputs"
-description = "llama 13b training"
+description = "LLaMA 70B training"
 
 [profiling]
 run_profiler = true
diff --git a/train_configs/llama_7b.toml b/train_configs/llama_7b.toml
index e7d1d90f..8ee38d01 100644
--- a/train_configs/llama_7b.toml
+++ b/train_configs/llama_7b.toml
@@ -1,7 +1,7 @@
 # TorchTrain Config.toml
 [job]
 dump_folder = "./outputs"
-description = "llama 7b training"
+description = "LLaMA 7B training"
 
 [profiling]
 run_profiler = true