diff --git a/megatron/training/training.py b/megatron/training/training.py
index 4d00bd1c8a..d071673aae 100644
--- a/megatron/training/training.py
+++ b/megatron/training/training.py
@@ -76,7 +76,6 @@
     reduce_max_stat_across_model_parallel_group,
     is_last_rank,
     print_rank_0,
-    print_rank_last,
     report_memory,
     unwrap_model,
     update_use_dist_ckpt,
@@ -1073,7 +1072,7 @@ def training_log(loss_dict, total_loss_dict, learning_rate, decoupled_learning_r
         total_loss_dict[advanced_iters_key] = 0
         total_loss_dict[skipped_iters_key] = 0
         total_loss_dict[nan_iters_key] = 0
-        print_rank_last(log_string)
+        print_rank_0(log_string)
         if report_memory_flag:
             # Report memory after optimizer state has been initialized.
             if torch.distributed.get_rank() == 0:
@@ -1784,9 +1783,9 @@ def evaluate_and_print_results(prefix, forward_step_func,
             process_non_loss_data_func(collected_non_loss_data, iteration, writer)
 
     length = len(string) + 1
-    print_rank_last('-' * length)
-    print_rank_last(string)
-    print_rank_last('-' * length)
+    print_rank_0('-' * length)
+    print_rank_0(string)
+    print_rank_0('-' * length)
 
 
 def cyclic_iter(iter):
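
For context, a minimal sketch of the two logging helpers this diff swaps, assuming they follow the usual shape of print_rank_0 and print_rank_last in megatron/training/utils.py (the exact upstream definitions may differ): print_rank_0 emits output only on global rank 0, while print_rank_last emits output only on the last global rank (typically the final pipeline stage), so the change routes the training-log and evaluation summaries to rank 0 instead of the last rank.

    # Sketch only: assumed shape of the helpers referenced by this diff,
    # not the verbatim upstream source.
    import torch


    def print_rank_0(message):
        """Print only on global rank 0 (or unconditionally when not distributed)."""
        if torch.distributed.is_initialized():
            if torch.distributed.get_rank() == 0:
                print(message, flush=True)
        else:
            print(message, flush=True)


    def print_rank_last(message):
        """Print only on the last global rank, i.e. the final pipeline stage."""
        if torch.distributed.is_initialized():
            if torch.distributed.get_rank() == torch.distributed.get_world_size() - 1:
                print(message, flush=True)
        else:
            print(message, flush=True)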