From bd17e7796a53ec83b0be5403e48014ce78e79c59 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 29 Jul 2024 17:23:25 -0700 Subject: [PATCH] [NeMo-UX] Adding recipes (#9720) (#9851) * Adding recipy proposal * Adding more recipies * Apply isort and black reformatting * Remove api.py inside llm.gpt.model * Adding resume to FineTuneRecipy * Fix spelling error * Fix spelling error * Fix spelling error * Apply isort and black reformatting * Adding resume to PreTrainRecipe * update recipe proposal to use sdk.Partial * Apply isort and black reformatting * update __init__ * update __init__ * fix return type * Fix bug in factory * rename recipe folder to 'models' * Fixes * Apply isort and black reformatting * Bug fixes * rename models --> configs * Apply isort and black reformatting * rename configs --> recipes * Apply isort and black reformatting * address comments --------- Signed-off-by: ashors1 Signed-off-by: artbataev Signed-off-by: marcromeyn Signed-off-by: ashors1 Signed-off-by: Hemil Desai Signed-off-by: hemildesai Co-authored-by: Marc Romeyn Co-authored-by: artbataev Co-authored-by: marcromeyn Co-authored-by: ashors1 Co-authored-by: ashors1 Co-authored-by: Hemil Desai Co-authored-by: hemildesai Co-authored-by: Anna Shors <71393111+ashors1@users.noreply.github.com> --- nemo/collections/llm/__init__.py | 35 +---- nemo/collections/llm/gpt/model/api.py | 125 ------------------ nemo/collections/llm/recipes/__init__.py | 13 ++ nemo/collections/llm/recipes/llama2_7b.py | 61 +++++++++ nemo/collections/llm/recipes/llama3_8b.py | 61 +++++++++ nemo/collections/llm/recipes/llama3_8b_16k.py | 45 +++++++ nemo/collections/llm/recipes/llama3_8b_64k.py | 45 +++++++ nemo/collections/llm/recipes/log/__init__.py | 0 nemo/collections/llm/recipes/log/default.py | 15 +++ nemo/collections/llm/recipes/mistral.py | 61 +++++++++ .../collections/llm/recipes/optim/__init__.py | 0 nemo/collections/llm/recipes/optim/adam.py | 16 +++ nemo/collections/llm/utils.py | 5 +- 13 files changed, 320 insertions(+), 162 deletions(-) delete mode 100644 nemo/collections/llm/gpt/model/api.py create mode 100644 nemo/collections/llm/recipes/__init__.py create mode 100644 nemo/collections/llm/recipes/llama2_7b.py create mode 100644 nemo/collections/llm/recipes/llama3_8b.py create mode 100644 nemo/collections/llm/recipes/llama3_8b_16k.py create mode 100644 nemo/collections/llm/recipes/llama3_8b_64k.py create mode 100644 nemo/collections/llm/recipes/log/__init__.py create mode 100644 nemo/collections/llm/recipes/log/default.py create mode 100644 nemo/collections/llm/recipes/mistral.py create mode 100644 nemo/collections/llm/recipes/optim/__init__.py create mode 100644 nemo/collections/llm/recipes/optim/adam.py diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index 83c0a3af48c0..b5283ee8a1c9 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -42,24 +42,7 @@ gpt_data_step, gpt_forward_step, ) -from nemo.collections.llm.gpt.model.api import ( - code_gemma_2b, - code_gemma_7b, - code_llama_7b, - code_llama_13b, - code_llama_34b, - code_llama_70b, - gemma, - gemma_2b, - gemma_7b, - llama2_7b, - llama2_13b, - llama2_70b, - llama3_8b, - llama3_70b, - mistral, - mixtral, -) +from nemo.collections.llm.recipes import * # noqa __all__ = [ "MockDataModule", @@ -103,21 +86,5 @@ "mock", "squad", "dolly", - "mistral", - "mixtral", - "llama2_7b", - "llama3_8b", - "llama2_13b", - "llama2_70b", - "llama3_70b", - 
"code_llama_7b", - "code_llama_13b", - "code_llama_34b", - "code_llama_70b", - "gemma", - "gemma_2b", - "gemma_7b", - "code_gemma_2b", - "code_gemma_7b", "peft", ] diff --git a/nemo/collections/llm/gpt/model/api.py b/nemo/collections/llm/gpt/model/api.py deleted file mode 100644 index 7c8cbf4d02e6..000000000000 --- a/nemo/collections/llm/gpt/model/api.py +++ /dev/null @@ -1,125 +0,0 @@ -import pytorch_lightning as pl - -from nemo.collections.llm.gpt.model.gemma import ( - CodeGemmaConfig2B, - CodeGemmaConfig7B, - GemmaConfig, - GemmaConfig2B, - GemmaConfig7B, - GemmaModel, -) -from nemo.collections.llm.gpt.model.llama import ( - CodeLlamaConfig7B, - CodeLlamaConfig13B, - CodeLlamaConfig34B, - CodeLlamaConfig70B, - Llama2Config7B, - Llama2Config13B, - Llama2Config70B, - Llama3Config8B, - Llama3Config70B, - LlamaModel, -) -from nemo.collections.llm.gpt.model.mistral import MistralConfig7B, MistralModel -from nemo.collections.llm.gpt.model.mixtral import MixtralConfig8x7B, MixtralModel -from nemo.collections.llm.utils import factory - - -@factory -def mistral() -> pl.LightningModule: - return MistralModel(MistralConfig7B()) - - -@factory -def mixtral() -> pl.LightningModule: - return MixtralModel(MixtralConfig8x7B()) - - -@factory -def llama2_7b() -> pl.LightningModule: - return LlamaModel(Llama2Config7B()) - - -@factory -def llama3_8b() -> pl.LightningModule: - return LlamaModel(Llama3Config8B()) - - -@factory -def llama2_13b() -> pl.LightningModule: - return LlamaModel(Llama2Config13B()) - - -@factory -def llama2_70b() -> pl.LightningModule: - return LlamaModel(Llama2Config70B()) - - -@factory -def llama3_70b() -> pl.LightningModule: - return LlamaModel(Llama3Config70B()) - - -@factory -def code_llama_7b() -> pl.LightningModule: - return LlamaModel(CodeLlamaConfig7B()) - - -@factory -def code_llama_13b() -> pl.LightningModule: - return LlamaModel(CodeLlamaConfig13B()) - - -@factory -def code_llama_34b() -> pl.LightningModule: - return LlamaModel(CodeLlamaConfig34B()) - - -@factory -def code_llama_70b() -> pl.LightningModule: - return LlamaModel(CodeLlamaConfig70B()) - - -@factory -def gemma() -> pl.LightningModule: - return GemmaModel(GemmaConfig()) - - -@factory -def gemma_2b() -> pl.LightningModule: - return GemmaModel(GemmaConfig2B()) - - -@factory -def gemma_7b() -> pl.LightningModule: - return GemmaModel(GemmaConfig7B()) - - -@factory -def code_gemma_2b() -> pl.LightningModule: - return GemmaModel(CodeGemmaConfig2B()) - - -@factory -def code_gemma_7b() -> pl.LightningModule: - return GemmaModel(CodeGemmaConfig7B()) - - -__all__ = [ - "mistral", - "mixtral", - "llama2_7b", - "llama3_8b", - "llama2_13b", - "llama2_70b", - "llama3_70b", - "code_llama_7b", - "code_llama_13b", - "code_llama_34b", - "code_llama_70b", - "gemma", - "gemma_2b", - "gemma_7b", - "code_gemma_2b", - "code_gemma_7b", -] diff --git a/nemo/collections/llm/recipes/__init__.py b/nemo/collections/llm/recipes/__init__.py new file mode 100644 index 000000000000..8d4d874362a9 --- /dev/null +++ b/nemo/collections/llm/recipes/__init__.py @@ -0,0 +1,13 @@ +from nemo.collections.llm.recipes import llama2_7b, llama3_8b, llama3_8b_16k, llama3_8b_64k, mistral +from nemo.collections.llm.recipes.log.default import default_log +from nemo.collections.llm.recipes.optim import adam + +__all__ = [ + "llama3_8b", + "llama3_8b_16k", + "llama3_8b_64k", + "llama2_7b", + "mistral", + "adam", + "default_log", +] diff --git a/nemo/collections/llm/recipes/llama2_7b.py b/nemo/collections/llm/recipes/llama2_7b.py new file mode 100644 index 
000000000000..1767dc4690c8 --- /dev/null +++ b/nemo/collections/llm/recipes/llama2_7b.py @@ -0,0 +1,61 @@ +import pytorch_lightning as pl + +from nemo import lightning as nl +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.api import squad +from nemo.collections.llm.gpt.model.llama import Llama2Config7B, LlamaModel +from nemo.collections.llm.peft.api import gpt_lora +from nemo.collections.llm.recipes.log.default import default_log +from nemo.collections.llm.recipes.optim.adam import adam_with_cosine_annealing +from nemo.collections.llm.utils import Partial, factory + +NAME = "llama2_7b" + + +@factory(name=NAME) +def model() -> pl.LightningModule: + return LlamaModel(Llama2Config7B()) + + +@factory(name=NAME) +def trainer(devices=8) -> nl.Trainer: + strategy = nl.MegatronStrategy(tensor_model_parallel_size=2) + + return nl.Trainer( + devices=devices, + max_steps=100, + accelerator="gpu", + strategy=strategy, + plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), + ) + + +@factory(name=NAME + "_hf") +def hf_resume() -> nl.AutoResume: + return nl.AutoResume(import_path="hf://meta-llama/Llama-2-7b-hf") + + +@factory(name=NAME, for_task="llm.pretrain") +def pretrain_recipe() -> Partial: + return Partial( + pretrain, + model=model, + trainer=trainer, + data=squad, + log=default_log, + optim=adam_with_cosine_annealing, + ) + + +@factory(name=NAME, for_task="llm.finetune") +def finetune_recipe() -> Partial: + return Partial( + finetune, + model=model, + trainer=trainer, + data=squad, + log=default_log, + optim=adam_with_cosine_annealing, + peft=gpt_lora, + resume=hf_resume, + ) diff --git a/nemo/collections/llm/recipes/llama3_8b.py b/nemo/collections/llm/recipes/llama3_8b.py new file mode 100644 index 000000000000..34ce418a0701 --- /dev/null +++ b/nemo/collections/llm/recipes/llama3_8b.py @@ -0,0 +1,61 @@ +import pytorch_lightning as pl + +from nemo import lightning as nl +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.api import squad +from nemo.collections.llm.gpt.model.llama import Llama3Config8B, LlamaModel +from nemo.collections.llm.peft.api import gpt_lora +from nemo.collections.llm.recipes.log.default import default_log +from nemo.collections.llm.recipes.optim.adam import adam_with_cosine_annealing +from nemo.collections.llm.utils import Partial, factory + +NAME = "llama3_8b" + + +@factory(name=NAME) +def model() -> pl.LightningModule: + return LlamaModel(Llama3Config8B(seq_length=16384)) + + +@factory(name=NAME) +def trainer(devices=8) -> nl.Trainer: + strategy = nl.MegatronStrategy(tensor_model_parallel_size=2) + + return nl.Trainer( + devices=devices, + max_steps=100, + accelerator="gpu", + strategy=strategy, + plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), + ) + + +@factory(name=NAME + "_hf") +def hf_resume() -> nl.AutoResume: + return nl.AutoResume(import_path="hf://meta-llama/Meta-Llama-3-8B") + + +@factory(name=NAME, for_task="llm.pretrain") +def pretrain_recipe() -> Partial: + return Partial( + pretrain, + model=model, + trainer=trainer, + data=squad, + log=default_log, + optim=adam_with_cosine_annealing, + ) + + +@factory(name=NAME, for_task="llm.finetune") +def finetune_recipe() -> Partial: + return Partial( + finetune, + model=model, + trainer=trainer, + data=squad, + log=default_log, + optim=adam_with_cosine_annealing, + peft=gpt_lora, + resume=hf_resume, + ) diff --git a/nemo/collections/llm/recipes/llama3_8b_16k.py b/nemo/collections/llm/recipes/llama3_8b_16k.py new file 
mode 100644 index 000000000000..3a590f26894e --- /dev/null +++ b/nemo/collections/llm/recipes/llama3_8b_16k.py @@ -0,0 +1,45 @@ +import pytorch_lightning as pl + +from nemo import lightning as nl +from nemo.collections.llm.api import pretrain +from nemo.collections.llm.gpt.data.api import squad +from nemo.collections.llm.gpt.model.llama import Llama3Config8B, LlamaModel +from nemo.collections.llm.recipes.log.default import default_log +from nemo.collections.llm.recipes.optim.adam import adam_with_cosine_annealing +from nemo.collections.llm.utils import Partial, factory + +NAME = "llama3_8b_16k" + + +@factory(name=NAME) +def model() -> pl.LightningModule: + return LlamaModel(Llama3Config8B(seq_length=16384)) + + +@factory(name=NAME) +def trainer(devices=8) -> nl.Trainer: + strategy = nl.MegatronStrategy( + tensor_model_parallel_size=4, + context_parallel_size=2, + sequence_parallel=True, + ) + + return nl.Trainer( + devices=devices, + max_steps=100, + accelerator="gpu", + strategy=strategy, + plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), + ) + + +@factory(name=NAME, for_task="llm.pretrain") +def pretrain_recipe() -> Partial: + return Partial( + pretrain, + model=model, + trainer=trainer, + data=squad, + log=default_log, + optim=adam_with_cosine_annealing, + ) diff --git a/nemo/collections/llm/recipes/llama3_8b_64k.py b/nemo/collections/llm/recipes/llama3_8b_64k.py new file mode 100644 index 000000000000..c826feb28901 --- /dev/null +++ b/nemo/collections/llm/recipes/llama3_8b_64k.py @@ -0,0 +1,45 @@ +import pytorch_lightning as pl + +from nemo import lightning as nl +from nemo.collections.llm.api import pretrain +from nemo.collections.llm.gpt.data.api import squad +from nemo.collections.llm.gpt.model.llama import Llama3Config8B, LlamaModel +from nemo.collections.llm.recipes.log.default import default_log +from nemo.collections.llm.recipes.optim.adam import adam_with_cosine_annealing +from nemo.collections.llm.utils import Partial, factory + +NAME = "llama3_8b_64k" + + +@factory(name=NAME) +def model() -> pl.LightningModule: + return LlamaModel(Llama3Config8B(seq_length=65536)) + + +@factory(name=NAME) +def trainer(devices=8) -> nl.Trainer: + strategy = nl.MegatronStrategy( + tensor_model_parallel_size=8, + context_parallel_size=4, + sequence_parallel=True, + ) + + return nl.Trainer( + devices=devices, + max_steps=100, + accelerator="gpu", + strategy=strategy, + plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), + ) + + +@factory(name=NAME, for_task="llm.pretrain") +def pretrain_recipe() -> Partial: + return Partial( + pretrain, + model=model, + trainer=trainer, + data=squad, + log=default_log, + optim=adam_with_cosine_annealing, + ) diff --git a/nemo/collections/llm/recipes/log/__init__.py b/nemo/collections/llm/recipes/log/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/nemo/collections/llm/recipes/log/default.py b/nemo/collections/llm/recipes/log/default.py new file mode 100644 index 000000000000..a40e141bfa95 --- /dev/null +++ b/nemo/collections/llm/recipes/log/default.py @@ -0,0 +1,15 @@ +from nemo import lightning as nl +from nemo.collections.llm.utils import factory + + +@factory +def default_log() -> nl.NeMoLogger: + ckpt = nl.ModelCheckpoint( + save_best_model=True, + save_last=True, + monitor="reduced_train_loss", + save_top_k=2, + save_on_train_epoch_end=True, + ) + + return nl.NeMoLogger(ckpt=ckpt) diff --git a/nemo/collections/llm/recipes/mistral.py b/nemo/collections/llm/recipes/mistral.py new file mode 100644 index 
000000000000..12af8d5d18ff --- /dev/null +++ b/nemo/collections/llm/recipes/mistral.py @@ -0,0 +1,61 @@ +import pytorch_lightning as pl + +from nemo import lightning as nl +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.api import squad +from nemo.collections.llm.gpt.model.mistral import MistralConfig7B, MistralModel +from nemo.collections.llm.peft.api import gpt_lora +from nemo.collections.llm.recipes.log.default import default_log +from nemo.collections.llm.recipes.optim.adam import adam_with_cosine_annealing +from nemo.collections.llm.utils import Partial, factory + +NAME = "mistral" + + +@factory(name=NAME) +def model() -> pl.LightningModule: + return MistralModel(MistralConfig7B()) + + +@factory(name=NAME) +def trainer(devices=8) -> nl.Trainer: + strategy = nl.MegatronStrategy(tensor_model_parallel_size=2) + + return nl.Trainer( + devices=devices, + max_steps=100, + accelerator="gpu", + strategy=strategy, + plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), + ) + + +@factory(name=NAME + "_hf") +def hf_resume() -> nl.AutoResume: + return nl.AutoResume(import_path="hf://mistralai/Mistral-7B-v0.3") + + +@factory(name=NAME, for_task="llm.pretrain") +def pretrain_recipe() -> Partial: + return Partial( + pretrain, + model=model, + trainer=trainer, + data=squad, + log=default_log, + optim=adam_with_cosine_annealing, + ) + + +@factory(name=NAME, for_task="llm.finetune") +def finetune_recipe() -> Partial: + return Partial( + finetune, + model=model, + trainer=trainer, + data=squad, + log=default_log, + optim=adam_with_cosine_annealing, + peft=gpt_lora, + resume=hf_resume, + ) diff --git a/nemo/collections/llm/recipes/optim/__init__.py b/nemo/collections/llm/recipes/optim/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/nemo/collections/llm/recipes/optim/adam.py b/nemo/collections/llm/recipes/optim/adam.py new file mode 100644 index 000000000000..4229001b2130 --- /dev/null +++ b/nemo/collections/llm/recipes/optim/adam.py @@ -0,0 +1,16 @@ +from megatron.core.optimizer import OptimizerConfig + +from nemo import lightning as nl +from nemo.collections.llm.utils import factory + + +@factory +def adam_with_cosine_annealing() -> nl.OptimizerModule: + return nl.MegatronOptimizerModule( + config=OptimizerConfig(optimizer="adam", lr=0.001, use_distributed_optimizer=True), + lr_scheduler=nl.lr_scheduler.CosineAnnealingScheduler(), + ) + + +# TODO: Fix the name-arg inside the factory-function so we don't need to do this +with_cosine_annealing = adam_with_cosine_annealing diff --git a/nemo/collections/llm/utils.py b/nemo/collections/llm/utils.py index b4382d0afd5f..5c087f60590a 100644 --- a/nemo/collections/llm/utils.py +++ b/nemo/collections/llm/utils.py @@ -42,9 +42,8 @@ def factory(*args: Any, **kwargs: Any) -> Union[Callable[[T], T], T]: try: import nemo_sdk as sdk - if not args and not kwargs: - # Used as @factory without arguments - return sdk.factory() + if not args: + return sdk.factory(**kwargs) else: # Used as @factory(*args, **kwargs) return sdk.factory(*args, **kwargs)
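
Reviewer note: the sketch below is not part of the change set; it only illustrates how the factory/Partial pattern introduced by this patch could be reused to register one more pretraining recipe. The module and recipe name ("my_llama3_8b") are hypothetical, and the parallelism/precision settings simply repeat values already used in the recipes above, so treat it as an illustration of the intended extension point rather than a definitive API reference.

# Hypothetical module, e.g. nemo/collections/llm/recipes/my_llama3_8b.py
# (illustration only; mirrors the pattern added in this patch).
import pytorch_lightning as pl

from nemo import lightning as nl
from nemo.collections.llm.api import pretrain
from nemo.collections.llm.gpt.data.api import squad
from nemo.collections.llm.gpt.model.llama import Llama3Config8B, LlamaModel
from nemo.collections.llm.recipes.log.default import default_log
from nemo.collections.llm.recipes.optim.adam import adam_with_cosine_annealing
from nemo.collections.llm.utils import Partial, factory

NAME = "my_llama3_8b"  # hypothetical recipe name


@factory(name=NAME)
def model() -> pl.LightningModule:
    # Model factory: returns the LightningModule wrapping the config.
    return LlamaModel(Llama3Config8B())


@factory(name=NAME)
def trainer(devices=8) -> nl.Trainer:
    # Plain tensor parallelism, as in the llama2_7b/llama3_8b recipes;
    # the 16k/64k variants add context parallelism on top of this.
    strategy = nl.MegatronStrategy(tensor_model_parallel_size=2)

    return nl.Trainer(
        devices=devices,
        max_steps=100,
        accelerator="gpu",
        strategy=strategy,
        plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"),
    )


@factory(name=NAME, for_task="llm.pretrain")
def pretrain_recipe() -> Partial:
    # The recipe is just a Partial that names the registered factories;
    # nothing is instantiated at import time.
    return Partial(
        pretrain,
        model=model,
        trainer=trainer,
        data=squad,
        log=default_log,
        optim=adam_with_cosine_annealing,
    )

The design keeps model, trainer, data, logging, and optimizer as independently registered factories, so a recipe only wires names together; individual pieces (for example trainer(devices=...)) can presumably be overridden when the recipe is resolved for a given task, rather than being fixed at import time.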