From 23698614e4370d7f3fb6cd6ecf41b7c6a4e8a2ac Mon Sep 17 00:00:00 2001
From: Tianyu Liu
Date: Tue, 12 Mar 2024 17:58:51 -0700
Subject: [PATCH] improve logging

ghstack-source-id: de61ec093b43a2ccbf1156c76ba81ecd698a6a8a
Pull Request resolved: https://github.com/pytorch/torchtrain/pull/132
---
 torchtrain/checkpoint.py                     | 16 ++--
 torchtrain/datasets/hf_datasets.py           | 27 ++----
 torchtrain/datasets/tokenizer.py             | 22 +++--
 torchtrain/float8_linear.py                  |  5 +-
 torchtrain/logging_utils.py                  |  9 +-
 torchtrain/metrics.py                        | 17 ++--
 torchtrain/models/llama/model.py             |  5 --
 torchtrain/parallelisms/__init__.py          |  8 +-
 torchtrain/parallelisms/parallelize_llama.py | 12 ++-
 torchtrain/profiling.py                      |  6 +-
 train.py                                     | 86 ++++++++++----------
 train_configs/debug_model.toml               |  4 +-
 train_configs/llama_13b.toml                 |  2 +-
 train_configs/llama_70b.toml                 |  2 +-
 train_configs/llama_7b.toml                  |  2 +-
 15 files changed, 99 insertions(+), 124 deletions(-)

diff --git a/torchtrain/checkpoint.py b/torchtrain/checkpoint.py
index bd141142..b660c240 100644
--- a/torchtrain/checkpoint.py
+++ b/torchtrain/checkpoint.py
@@ -17,7 +17,7 @@
     set_model_state_dict,
     set_optimizer_state_dict,
 )
-from torchtrain.logging_utils import rank0_log
+from torchtrain.logging_utils import logger
 
 
 class IntervalType(enum.Enum):
@@ -109,13 +109,13 @@ def save(self, curr_step: int, force: bool = False) -> None:
             self.work = None
             self.doit = None
 
-        rank0_log(f"Saving a checkpoint in step {curr_step}.")
+        logger.info(f"Saving a checkpoint at step {curr_step}")
         begin = time.monotonic()
         dcp.save(self.states, checkpoint_id=self.create_checkpoint_id(curr_step))
         self.reset()
-        rank0_log(
-            f"Finish saving the checkpoint in step {curr_step}. "
-            f"{time.monotonic() - begin} seconds"
+        logger.info(
+            f"Finished saving the checkpoint at step {curr_step} "
+            f"in {time.monotonic() - begin} seconds"
         )
 
     def load(self, step: int = -1) -> bool:
@@ -136,11 +136,13 @@ def load(self, step: int = -1) -> bool:
                 return False
             step = max(step_counts)
 
-        rank0_log("Loading a checkpoint.")
+        logger.info("Loading a checkpoint")
         begin = time.monotonic()
         dcp.load(
             self.states,
             checkpoint_id=self.create_checkpoint_id(step),
         )
-        rank0_log(f"Finish loading a checkpoint. {time.monotonic() - begin} seconds.")
+        logger.info(
+            f"Finished loading the checkpoint in {time.monotonic() - begin} seconds"
+        )
         return True
diff --git a/torchtrain/datasets/hf_datasets.py b/torchtrain/datasets/hf_datasets.py
index d8ecec09..63e0db04 100644
--- a/torchtrain/datasets/hf_datasets.py
+++ b/torchtrain/datasets/hf_datasets.py
@@ -7,8 +7,7 @@
 from torch.utils.data import DataLoader, IterableDataset
 
 from torchtrain.datasets.tokenizer import TokenizerIf
-from torchtrain.logging_utils import rank0_log
-from torchtrain.utils import Color
+from torchtrain.logging_utils import logger
 
 from datasets import load_dataset, load_from_disk
 from datasets.distributed import split_dataset_by_node
@@ -97,21 +96,17 @@ def __init__(
     ) -> None:
         if dataset_name not in _supported_datasets:
             raise ValueError(
-                f"Dataset {dataset_name} is not supported. Supported datasets are: {_supported_datasets.keys()}"
+                f"Dataset {dataset_name} is not supported. Supported datasets are: {_supported_datasets.keys()}."
             )
 
         # TODO: This is a temporary solution for small datasets like Alpaca.
         # For larger datasets we need to use a more scalable approach.
         if dataset_path:
-            rank0_log(
-                f"{Color.green}Loading '{dataset_name}' dataset locally from {dataset_path}...{Color.reset}"
-            )
+            logger.info(f"Loading {dataset_name} dataset locally from {dataset_path}")
             ds = load_from_disk(dataset_path)
         else:
-            rank0_log(
-                f"{Color.green}Preparing '{dataset_name}' dataset from HuggingFace...{Color.reset}"
-            )
-            # Setting `streaming=True` works for large dataset, but the speed is slow.
+            logger.info(f"Preparing {dataset_name} dataset from HuggingFace")
+            # Setting `streaming=True` works for large dataset, but is slightly slower and unstable.
             # c4 is huge, and requires both streaming and language selection (we default to en)
             if dataset_name == "c4":
                 ds = load_dataset(
@@ -147,16 +142,12 @@ def __iter__(self):
                     label = x[1:]
                     yield input, label
             if not self.infinite:
-                rank0_log(
-                    f"{Color.red}WARNING:{Color.reset} dataset {Color.yellow}'{self.dataset_name}'{Color.reset} has "
-                    f"run out of data.{Color.reset}"
-                )
+                logger.warning(f"Dataset {self.dataset_name} has run out of data.")
                 break
             else:
-                # we are re-looping on the same dataset, warn user
-                rank0_log(
-                    f"{Color.red}WARNING:{Color.reset} dataset {Color.yellow}'{self.dataset_name}'{Color.reset} is "
-                    f"being re-looped. Loss related metrics might be misleading.{Color.reset}"
+                logger.warning(
+                    f"Dataset {self.dataset_name} is being re-looped. "
+                    "Loss related metrics might be misleading."
                 )
 
 
diff --git a/torchtrain/datasets/tokenizer.py b/torchtrain/datasets/tokenizer.py
index 07593b55..2f2f6fee 100644
--- a/torchtrain/datasets/tokenizer.py
+++ b/torchtrain/datasets/tokenizer.py
@@ -1,5 +1,5 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
-# All rights reserved.
+# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
 #
 # This source code is licensed under the BSD-style license found in the
 # LICENSE file in the root directory of this source tree.
@@ -8,13 +8,11 @@
 import os
 from abc import ABC, abstractmethod
-from logging import getLogger
 from typing import List
 
 from sentencepiece import SentencePieceProcessor
 
-
-logger = getLogger()
+from torchtrain.logging_utils import logger
 
 
 class TokenizerIf(ABC):
@@ -40,6 +38,7 @@ def n_words(self) -> int:
 
 
 def create_tokenizer(tokenizer_type: str, tokenizer_path: str) -> TokenizerIf:
+    logger.info(f"Building {tokenizer_type} tokenizer locally from {tokenizer_path}")
     if tokenizer_type == "sentencepiece":
         return SentencePieceTokenizer(tokenizer_path)
     else:
@@ -47,19 +46,18 @@ def create_tokenizer(tokenizer_type: str, tokenizer_path: str) -> TokenizerIf:
 
 
 class SentencePieceTokenizer(TokenizerIf):
-    """tokenizing and encoding/decoding text using SentencePiece."""
+    """
+    Tokenizing and encoding/decoding text based on a SentencePiece model.
+
+    Args:
+        tokenizer_path (str): The path to the SentencePiece model file.
+    """
 
     def __init__(self, tokenizer_path: str):
-        """
-        Initializes the Tokenizer with a SentencePiece model.
-
-        Args:
-            tokenizer_path (str): The path to the SentencePiece model file.
-        """
         super().__init__(tokenizer_path)
         # reload tokenizer
         self.sp_model = SentencePieceProcessor(model_file=tokenizer_path)
-        logger.info(f"Reloaded SentencePiece model from {tokenizer_path}")
 
         # BOS / EOS token IDs
         self._n_words: int = self.sp_model.vocab_size()
@@ -67,7 +65,7 @@ def __init__(self, tokenizer_path: str):
         self.eos_id: int = self.sp_model.eos_id()
         self.pad_id: int = self.sp_model.pad_id()
         logger.info(
-            f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
+            f"SentencePieceTokenizer built: #words {self.n_words}, BOS ID {self.bos_id}, EOS ID {self.eos_id}"
         )
 
         assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()
diff --git a/torchtrain/float8_linear.py b/torchtrain/float8_linear.py
index 10d60d9a..169129a6 100644
--- a/torchtrain/float8_linear.py
+++ b/torchtrain/float8_linear.py
@@ -5,9 +5,8 @@
 # All rights reserved
 
 from torchtrain.config_manager import JobConfig
-from torchtrain.logging_utils import rank0_log
+from torchtrain.logging_utils import logger
 from torchtrain.models.llama import Transformer
-from torchtrain.utils import Color
 
 
 def build_fp8_linear(model: Transformer, job_config: JobConfig):
@@ -42,4 +41,4 @@ def build_fp8_linear(model: Transformer, job_config: JobConfig):
 
     # Mutates the model inplace replacing instances of torch.nn.Linear with float8_linear_type
     swap_linear_with_float8_linear(model, float8_linear_type)
-    rank0_log(f"{Color.green}Using {linear_type} float8 linear layers{Color.reset}")
+    logger.info(f"Swapped to {linear_type} float8 linear layers")
diff --git a/torchtrain/logging_utils.py b/torchtrain/logging_utils.py
index f2ed30b8..dfd758a0 100644
--- a/torchtrain/logging_utils.py
+++ b/torchtrain/logging_utils.py
@@ -1,15 +1,12 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
+
 import logging
-import torch
 
 logger = logging.getLogger()
 
 
-def rank0_log(msg):
-    if torch.distributed.get_rank() == 0:
-        logger.info(msg)
-
-
 def init_logger():
     logger.setLevel(logging.INFO)
     ch = logging.StreamHandler()
diff --git a/torchtrain/metrics.py b/torchtrain/metrics.py
index 91a1e184..2f495d90 100644
--- a/torchtrain/metrics.py
+++ b/torchtrain/metrics.py
@@ -13,8 +13,7 @@
 import torch.nn as nn
 from torch.utils.tensorboard import SummaryWriter
 from torchtrain.config_manager import JobConfig
-
-from torchtrain.logging_utils import rank0_log
+from torchtrain.logging_utils import logger
 
 # note that GiB (gibibyte) is 1024, vs GB is 1000
 _gib_in_bytes = 1024 * 1024 * 1024
@@ -122,18 +121,18 @@ def get_current_stats(self, return_data: bool = False):
             self.device_active_memory_usage, self.device_capacity, precision=2
         )
 
-        display_str = ""
-        display_str += f"Current Memory: {self.device_name} ({self.device_index}): Reserved: {self.device_reserved_memory_pct}%, "
-        display_str += f"Alloc {self.device_alloc_memory_pct}%, Active: {self.device_active_memory_pct}%\n"
+        display_str = f"{self.device_name} ({self.device_index}). "
+        display_str += f"Current memory: reserved {self.device_reserved_memory_pct}%, "
+        display_str += f"alloc {self.device_alloc_memory_pct}%, active {self.device_active_memory_pct}%. "
 
         self.get_peak_stats(curr_mem)
 
         peak_active_pct = self.get_pct_memory(self.peak_active_memory)
         peak_allocated_pct = self.get_pct_memory(self.peak_allocated_memory)
         peak_reserved_pct = self.get_pct_memory(self.peak_reserved_memory)
-        display_str += f"Peak Memory: Reserved {peak_reserved_pct}%, Alloc {peak_allocated_pct}%, Active: {peak_active_pct}%\n"
+        display_str += f"Peak memory: reserved {peak_reserved_pct}%, alloc {peak_allocated_pct}%, active {peak_active_pct}%. "
 
-        display_str += f"num retries: {self.num_retries}, num ooms: {self.num_ooms}"
+        display_str += f"Num retries: {self.num_retries}. Num ooms: {self.num_ooms}."
 
         if self.num_retries > 0:
             display_str += f"\nWARNING: {self.num_retries} retries -- recommend lowering batch size for max performance\n"
@@ -224,8 +223,8 @@ def build_metric_logger(config: JobConfig, tag: Optional[str] = None):
 
     enable_tb = config.metrics.enable_tensorboard
     if enable_tb:
-        rank0_log(
-            f"Metrics logging active. Tensorboard logs will be saved at {log_dir}."
+        logger.info(
+            f"Metrics logging active. Tensorboard logs will be saved at {log_dir}"
         )
 
     rank_str = f"rank_{torch.distributed.get_rank()}"
diff --git a/torchtrain/models/llama/model.py b/torchtrain/models/llama/model.py
index 1ba505cf..da6a4e14 100644
--- a/torchtrain/models/llama/model.py
+++ b/torchtrain/models/llama/model.py
@@ -8,8 +8,6 @@
 import torch.nn.functional as F
 from torch import nn
 
-from torchtrain.logging_utils import rank0_log
-
 
 @dataclass
 class ModelArgs:
@@ -476,8 +474,6 @@ def __init__(self, model_args: ModelArgs):
 
         # self.reset_parameters()
 
-        rank0_log(f"Model built with: {self.model_args}")
-
     def reset_parameters(
         self,
     ):
@@ -493,7 +489,6 @@ def reset_parameters(
             a=-cutoff_factor * final_out_std,
             b=cutoff_factor * final_out_std,
         )
-        rank0_log("Model fully initialized via reset_params")
 
     def forward(self, tokens: torch.Tensor):
         """
diff --git a/torchtrain/parallelisms/__init__.py b/torchtrain/parallelisms/__init__.py
index fdcd938d..1226a9c7 100644
--- a/torchtrain/parallelisms/__init__.py
+++ b/torchtrain/parallelisms/__init__.py
@@ -1,17 +1,13 @@
 # Copyright (c) Meta Platforms, Inc. and affiliates.
 # This software may be used and distributed according to the terms of the Llama 2 Community License Agreement.
 
-import logging
 from dataclasses import dataclass
 from functools import cached_property
 
 from torch.distributed.device_mesh import init_device_mesh
-
+from torchtrain.logging_utils import logger
 from torchtrain.parallelisms.parallelize_llama import parallelize_llama
 
-logger = logging.getLogger(__name__)
-
-
 models_parallelize_fns = {
     "llama": parallelize_llama,
 }
@@ -48,8 +44,8 @@ def build_mesh(self, device_type):
             if d > 1:
                 dims.append(d)
                 names.append(name)
-        names = tuple(names)
         logger.info(f"Building {len(dims)}-D device mesh with {names}, {dims}")
+        names = tuple(names)
         return init_device_mesh(device_type, dims, mesh_dim_names=names)
 
     @property
diff --git a/torchtrain/parallelisms/parallelize_llama.py b/torchtrain/parallelisms/parallelize_llama.py
index 34252b6b..6c72b046 100644
--- a/torchtrain/parallelisms/parallelize_llama.py
+++ b/torchtrain/parallelisms/parallelize_llama.py
@@ -4,7 +4,6 @@
 # this file applies the PTD parallelisms and various training techniques to the
 # llama model, i.e. activation checkpoint, etc.
 
-import logging
 from collections import defaultdict
 
 import torch
@@ -15,7 +14,6 @@
     Replicate,
     Shard,
 )
-
 from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
     checkpoint_wrapper as ptd_checkpoint_wrapper,
     CheckpointImpl,
@@ -34,11 +32,9 @@
     RowwiseParallel,
 )
 from torchtrain.config_manager import JobConfig
-from torchtrain.logging_utils import rank0_log
+from torchtrain.logging_utils import logger
 from torchtrain.meta_init import meta_to_real_init_fn
 
-logger = logging.getLogger(__name__)
-
 
 def distribute_rmsnorm(module, device_mesh):
     # temp sharding API until PTD API is added
@@ -195,7 +191,7 @@ def parallelize_llama(model, world_mesh, parallel_dims, job_config: JobConfig):
             parallelize_plan=layer_plan,
         )
 
-    rank0_log("Applied Sequence Parallelism to the model...")
+    logger.info("Applied Sequence Parallelism to the model")
 
     if parallel_dims.dp_enabled:
         dp_mesh = world_mesh["dp"]
@@ -228,7 +224,7 @@ def parallelize_llama(model, world_mesh, parallel_dims, job_config: JobConfig):
             # wrap the rest layers with FSDP
             model = wrap(model)
 
-        rank0_log("Applied FSDP to the model...")
+        logger.info("Applied FSDP to the model")
     else:
         meta_to_real_init_fn(model)
         model.cuda()
@@ -236,4 +232,6 @@ def parallelize_llama(model, world_mesh, parallel_dims, job_config: JobConfig):
         # we have now moved from meta to device,
         # reset parameters for proper initialization
         model.reset_parameters()
+        logger.info("Model fully initialized via reset_parameters")
+
     return model
diff --git a/torchtrain/profiling.py b/torchtrain/profiling.py
index ed1a9b5a..847d967a 100644
--- a/torchtrain/profiling.py
+++ b/torchtrain/profiling.py
@@ -6,7 +6,7 @@
 
 import torch
 from torchtrain.config_manager import JobConfig
-from torchtrain.logging_utils import rank0_log
+from torchtrain.logging_utils import logger
 
 
 @contextlib.contextmanager
@@ -31,11 +31,11 @@ def trace_handler(prof):
             curr_trace_dir = os.path.join(trace_dir, curr_trace_dir_name)
             if not os.path.exists(curr_trace_dir):
                 os.makedirs(curr_trace_dir, exist_ok=True)
-            rank0_log(f"exporting profile traces to {curr_trace_dir}")
+            logger.info(f"Exporting profile traces to {curr_trace_dir}")
             prof.export_chrome_trace(f"{curr_trace_dir}/rank{rank}_trace.json")
 
-        rank0_log(f"Profiling active. Traces will be saved at {trace_dir}")
+        logger.info(f"Profiling active. Traces will be saved at {trace_dir}")
 
         if not os.path.exists(trace_dir):
             os.makedirs(trace_dir, exist_ok=True)
diff --git a/train.py b/train.py
index 603364d0..69cea633 100644
--- a/train.py
+++ b/train.py
@@ -10,7 +10,6 @@
 
 import numpy as np
 
-# torch imports
 import torch
 import torch.nn.functional as F
 from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
@@ -19,18 +18,14 @@
 
 from torchtrain.checkpoint import CheckpointManager, IntervalType
 from torchtrain.config_manager import JobConfig
-
-# torchtrain related
 from torchtrain.datasets import create_tokenizer, dataloader_fn
 from torchtrain.float8_linear import build_fp8_linear
-from torchtrain.logging_utils import init_logger, rank0_log
+from torchtrain.logging_utils import init_logger, logger
 from torchtrain.lr_scheduling import get_lr_scheduler
 from torchtrain.meta_init import meta_model_init
 from torchtrain.metrics import build_metric_logger, get_num_params, GPUMemoryMonitor
-
 from torchtrain.models import model_name_to_cls, model_name_to_tokenizer, models_config
 from torchtrain.parallelisms import models_parallelize_fns, ParallelDims
-
 from torchtrain.profiling import maybe_run_profiler
 from torchtrain.utils import Color, dist_max, dist_mean
 
@@ -74,7 +69,7 @@ def build_optimizer(model, job_config: JobConfig):
             model.parameters(), lr=lr, betas=(0.9, 0.95), weight_decay=0.1
         )
     else:
-        raise NotImplementedError(f"optimizer {name} not added")
+        raise NotImplementedError(f"Optimizer {name} not added.")
 
     return optimizer
 
@@ -83,16 +78,18 @@ def build_grad_scaler(model):
     # apply gradient scaling if mixed precision training is enabled with fp16 param dtype
     if model.mixed_precision.param_dtype == torch.float16:
         enable_grad_scaling = True
-        rank0_log("Enabling gradient scaling for mixed precision training.")
+        logger.info("Enabling gradient scaling for mixed precision training")
     else:
         enable_grad_scaling = False
-        rank0_log("Gradient scaling not enabled.")
+        logger.info("Gradient scaling not enabled")
 
     return ShardedGradScaler(enabled=enable_grad_scaling)
 
 
 def main(job_config: JobConfig):
     init_logger()
+    logger.info(f"Starting job: {job_config.job.description}")
+
     # init world mesh
     world_size = int(os.environ["WORLD_SIZE"])
     parallel_dims = ParallelDims(
@@ -103,9 +100,9 @@ def main(job_config: JobConfig):
         enable_loss_parallel=job_config.training.enable_loss_parallel,
     )
     world_mesh = parallel_dims.build_mesh(device_type="cuda")
-    rank0_log(f"Starting job: {job_config.job.description}")
+
     model_name = job_config.model.name
-    rank0_log(f"Building {model_name}")
+
     # build tokenizer
     tokenizer_type = model_name_to_tokenizer[model_name]
     tokenizer = create_tokenizer(tokenizer_type, job_config.model.tokenizer_path)
@@ -127,17 +124,15 @@ def main(job_config: JobConfig):
         dp_degree,
         dp_rank,
     )
-    rank0_log(
-        f"{Color.green}Built Dataloader for '{job_config.training.dataset}' dataset.{Color.reset}"
-    )
 
-    # build model
+    # build model (using meta init)
     model_cls = model_name_to_cls[model_name]
     model_config = models_config[model_name][job_config.model.flavor]
     model_config.vocab_size = tokenizer.n_words
-
-    # build model using meta init
     with meta_model_init():
+        logger.info(
+            f"Building {model_name} {job_config.model.flavor} with {model_config}"
+        )
         model = model_cls.from_model_args(model_config)
 
     # apply fp8 linear module swap
@@ -146,21 +141,21 @@ def main(job_config: JobConfig):
 
     # log model size
     model_param_count = get_num_params(model)
     if _is_local_logging:
-        rank0_log(
-            f"{Color.blue}Model {model_name} {job_config.model.flavor} {Color.red}size: {model_param_count:,}"
-            f" total parameters{Color.reset}"
+        logger.info(
+            f"{Color.blue}Model {model_name} {job_config.model.flavor} "
+            f"{Color.red}size: {model_param_count:,} total parameters{Color.reset}"
         )
     else:
-        rank0_log(
+        logger.info(
             f"{model_name} {job_config.model.flavor} size: {model_param_count:,} total parameters"
         )
+
     # initialize GPU memory monitor before applying parallelisms to the model
     gpu_metrics = GPUMemoryMonitor("cuda")
-    rank0_log(f"GPU memory usage: {gpu_metrics}")
+    logger.info(f"GPU memory initial condition: {gpu_metrics}")
 
-    # apply PTD parallelisms + AC
+    # apply PTD parallelisms + AC/selective AC
     model = models_parallelize_fns[model_name](
         model, world_mesh, parallel_dims, job_config
     )
@@ -168,7 +163,7 @@ def main(job_config: JobConfig):
     # to use FSDP-customized gradient scaler and gradient clipping solutions
     assert isinstance(model, FSDP)
 
-    # build optimizer after apply parallelisms to the model
+    # build optimizer after applying parallelisms to the model
     optimizer = build_optimizer(model, job_config)
     scheduler = get_lr_scheduler(optimizer, job_config)
 
@@ -182,7 +177,7 @@ def main(job_config: JobConfig):
         torch._dynamo.config._experimental_support_context_fn_in_torch_utils_checkpoint = (
             True
         )
-        rank0_log(f"Compiling model {model_name} with torch.compile...")
+        logger.info("Compiling model with torch.compile")
         model = torch.compile(
             model,
         )
@@ -210,10 +205,12 @@ def main(job_config: JobConfig):
 
     with maybe_run_profiler(job_config) as torch_profiler:
         checkpoint.reset()
+
         # variables used to keep info for metrics logging
         losses_since_last_log: List[float] = []
         nwords_since_last_log = 0
         time_last_log = timer()
+
         while train_state.step < job_config.training.steps:
             train_state.step += 1
             # get batch
@@ -306,20 +303,20 @@ def main(job_config: JobConfig):
                 nwords_since_last_log = 0
                 time_last_log = timer()
 
-            if _is_local_logging:
-                rank0_log(
-                    f"{Color.cyan}step: {train_state.step:>2} {Color.green}loss: {round(train_state.current_loss,4):>7}"
-                    f" {Color.reset}iter: {Color.blue}{curr_iter_time:>7}{Color.reset}"
-                    f" data: {Color.blue}{data_load_time:>5} {Color.reset}"
-                    f"lr: {Color.yellow}{round(float(scheduler.get_last_lr()[0]), 8):<6}{Color.reset}"
-                )
-            else:
-                rank0_log(
-                    f"step: {train_state.step:>2} loss: {round(train_state.current_loss,4):>7}"
-                    f" iter: {curr_iter_time:>7}"
-                    f" data: {data_load_time:>5} "
-                    f"lr: {round(float(scheduler.get_last_lr()[0]), 8):<6}"
-                )
+                if _is_local_logging:
+                    logger.info(
+                        f"{Color.cyan}step: {train_state.step:2} "
+                        f"{Color.green}loss: {global_avg_loss.item():7.4f} "
+                        f"{Color.blue}wps: {round(wps):7,} "
+                        f"{Color.yellow}peak_memory: {gpu_mem_stats.reserved_peak:5}%{Color.reset}"
+                    )
+                else:
+                    logger.info(
+                        f"step: {train_state.step:2} "
+                        f"loss: {global_avg_loss.item():7.4f} "
+                        f"wps: {round(wps):7,} "
+                        f"peak_memory: {gpu_mem_stats.reserved_peak:5}%"
+                    )
 
             scheduler.step()
@@ -331,11 +328,14 @@ def main(job_config: JobConfig):
     # calc and show average iter time, disregard first three iterations (warmup)
     if len(train_state.iter_times) > 3:
         avg_iter_time = np.mean(train_state.iter_times[3:])
-        rank0_log(f"Average iter time: {avg_iter_time:.4f} seconds")
         avg_data_load_time = np.mean(train_state.data_load_times[3:])
-        rank0_log(f"Average data load time: {avg_data_load_time:.4f} seconds")
+        logger.info(
+            "Average time per iteration: "
+            f"training {avg_iter_time:.4f} seconds, "
+            f"data loading {avg_data_load_time:.4f} seconds"
+        )
 
-    rank0_log(f"{gpu_metrics.get_current_stats()}")
+    logger.info(f"GPU memory usage: {gpu_metrics.get_current_stats()}")
 
 
 if __name__ == "__main__":
diff --git a/train_configs/debug_model.toml b/train_configs/debug_model.toml
index 6efc5b9d..6289f010 100644
--- a/train_configs/debug_model.toml
+++ b/train_configs/debug_model.toml
@@ -1,7 +1,7 @@
 # TorchTrain Config.toml
 [job]
 dump_folder = "./outputs"
-description = "debug training"
+description = "LLaMA debug training"
 
 [profiling]
 run_profiler = true
@@ -12,7 +12,7 @@ profile_every_x_iter = 10
 [metrics]
 enable_tensorboard = true
 save_tb_folder = "tb"
-log_freq = 10
+log_freq = 1
 
 [model]
 name = "llama"
diff --git a/train_configs/llama_13b.toml b/train_configs/llama_13b.toml
index c095db19..0b31d9de 100644
--- a/train_configs/llama_13b.toml
+++ b/train_configs/llama_13b.toml
@@ -1,7 +1,7 @@
 # TorchTrain Config.toml
 [job]
 dump_folder = "./outputs"
-description = "llama 13b training"
+description = "LLaMA 13B training"
 
 [profiling]
 run_profiler = true
diff --git a/train_configs/llama_70b.toml b/train_configs/llama_70b.toml
index d56c4c52..b017f375 100644
--- a/train_configs/llama_70b.toml
+++ b/train_configs/llama_70b.toml
@@ -1,7 +1,7 @@
 # TorchTrain Config.toml
 [job]
 dump_folder = "./outputs"
-description = "llama 13b training"
+description = "LLaMA 70B training"
 
 [profiling]
 run_profiler = true
diff --git a/train_configs/llama_7b.toml b/train_configs/llama_7b.toml
index e7d1d90f..8ee38d01 100644
--- a/train_configs/llama_7b.toml
+++ b/train_configs/llama_7b.toml
@@ -1,7 +1,7 @@
 # TorchTrain Config.toml
 [job]
 dump_folder = "./outputs"
-description = "llama 7b training"
+description = "LLaMA 7B training"
 
 [profiling]
 run_profiler = true