From bd17e7796a53ec83b0be5403e48014ce78e79c59 Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 29 Jul 2024 17:23:25 -0700 Subject: [PATCH] [NeMo-UX] Adding recipes (#9720) (#9851) * Adding recipy proposal * Adding more recipies * Apply isort and black reformatting * Remove api.py inside llm.gpt.model * Adding resume to FineTuneRecipy * Fix spelling error * Fix spelling error * Fix spelling error * Apply isort and black reformatting * Adding resume to PreTrainRecipe * update recipe proposal to use sdk.Partial * Apply isort and black reformatting * update __init__ * update __init__ * fix return type * Fix bug in factory * rename recipe folder to 'models' * Fixes * Apply isort and black reformatting * Bug fixes * rename models --> configs * Apply isort and black reformatting * rename configs --> recipes * Apply isort and black reformatting * address comments --------- Signed-off-by: ashors1 Signed-off-by: artbataev Signed-off-by: marcromeyn Signed-off-by: ashors1 Signed-off-by: Hemil Desai Signed-off-by: hemildesai Co-authored-by: Marc Romeyn Co-authored-by: artbataev Co-authored-by: marcromeyn Co-authored-by: ashors1 Co-authored-by: ashors1 Co-authored-by: Hemil Desai Co-authored-by: hemildesai Co-authored-by: Anna Shors <71393111+ashors1@users.noreply.github.com> --- nemo/collections/llm/__init__.py | 35 +---- nemo/collections/llm/gpt/model/api.py | 125 ------------------ nemo/collections/llm/recipes/__init__.py | 13 ++ nemo/collections/llm/recipes/llama2_7b.py | 61 +++++++++ nemo/collections/llm/recipes/llama3_8b.py | 61 +++++++++ nemo/collections/llm/recipes/llama3_8b_16k.py | 45 +++++++ nemo/collections/llm/recipes/llama3_8b_64k.py | 45 +++++++ nemo/collections/llm/recipes/log/__init__.py | 0 nemo/collections/llm/recipes/log/default.py | 15 +++ nemo/collections/llm/recipes/mistral.py | 61 +++++++++ .../collections/llm/recipes/optim/__init__.py | 0 nemo/collections/llm/recipes/optim/adam.py | 16 +++ nemo/collections/llm/utils.py | 5 +- 13 files changed, 320 insertions(+), 162 deletions(-) delete mode 100644 nemo/collections/llm/gpt/model/api.py create mode 100644 nemo/collections/llm/recipes/__init__.py create mode 100644 nemo/collections/llm/recipes/llama2_7b.py create mode 100644 nemo/collections/llm/recipes/llama3_8b.py create mode 100644 nemo/collections/llm/recipes/llama3_8b_16k.py create mode 100644 nemo/collections/llm/recipes/llama3_8b_64k.py create mode 100644 nemo/collections/llm/recipes/log/__init__.py create mode 100644 nemo/collections/llm/recipes/log/default.py create mode 100644 nemo/collections/llm/recipes/mistral.py create mode 100644 nemo/collections/llm/recipes/optim/__init__.py create mode 100644 nemo/collections/llm/recipes/optim/adam.py diff --git a/nemo/collections/llm/__init__.py b/nemo/collections/llm/__init__.py index 83c0a3af48c0..b5283ee8a1c9 100644 --- a/nemo/collections/llm/__init__.py +++ b/nemo/collections/llm/__init__.py @@ -42,24 +42,7 @@ gpt_data_step, gpt_forward_step, ) -from nemo.collections.llm.gpt.model.api import ( - code_gemma_2b, - code_gemma_7b, - code_llama_7b, - code_llama_13b, - code_llama_34b, - code_llama_70b, - gemma, - gemma_2b, - gemma_7b, - llama2_7b, - llama2_13b, - llama2_70b, - llama3_8b, - llama3_70b, - mistral, - mixtral, -) +from nemo.collections.llm.recipes import * # noqa __all__ = [ "MockDataModule", @@ -103,21 +86,5 @@ "mock", "squad", "dolly", - "mistral", - "mixtral", - "llama2_7b", - "llama3_8b", - "llama2_13b", - "llama2_70b", - "llama3_70b", - 
"code_llama_7b", - "code_llama_13b", - "code_llama_34b", - "code_llama_70b", - "gemma", - "gemma_2b", - "gemma_7b", - "code_gemma_2b", - "code_gemma_7b", "peft", ] diff --git a/nemo/collections/llm/gpt/model/api.py b/nemo/collections/llm/gpt/model/api.py deleted file mode 100644 index 7c8cbf4d02e6..000000000000 --- a/nemo/collections/llm/gpt/model/api.py +++ /dev/null @@ -1,125 +0,0 @@ -import pytorch_lightning as pl - -from nemo.collections.llm.gpt.model.gemma import ( - CodeGemmaConfig2B, - CodeGemmaConfig7B, - GemmaConfig, - GemmaConfig2B, - GemmaConfig7B, - GemmaModel, -) -from nemo.collections.llm.gpt.model.llama import ( - CodeLlamaConfig7B, - CodeLlamaConfig13B, - CodeLlamaConfig34B, - CodeLlamaConfig70B, - Llama2Config7B, - Llama2Config13B, - Llama2Config70B, - Llama3Config8B, - Llama3Config70B, - LlamaModel, -) -from nemo.collections.llm.gpt.model.mistral import MistralConfig7B, MistralModel -from nemo.collections.llm.gpt.model.mixtral import MixtralConfig8x7B, MixtralModel -from nemo.collections.llm.utils import factory - - -@factory -def mistral() -> pl.LightningModule: - return MistralModel(MistralConfig7B()) - - -@factory -def mixtral() -> pl.LightningModule: - return MixtralModel(MixtralConfig8x7B()) - - -@factory -def llama2_7b() -> pl.LightningModule: - return LlamaModel(Llama2Config7B()) - - -@factory -def llama3_8b() -> pl.LightningModule: - return LlamaModel(Llama3Config8B()) - - -@factory -def llama2_13b() -> pl.LightningModule: - return LlamaModel(Llama2Config13B()) - - -@factory -def llama2_70b() -> pl.LightningModule: - return LlamaModel(Llama2Config70B()) - - -@factory -def llama3_70b() -> pl.LightningModule: - return LlamaModel(Llama3Config70B()) - - -@factory -def code_llama_7b() -> pl.LightningModule: - return LlamaModel(CodeLlamaConfig7B()) - - -@factory -def code_llama_13b() -> pl.LightningModule: - return LlamaModel(CodeLlamaConfig13B()) - - -@factory -def code_llama_34b() -> pl.LightningModule: - return LlamaModel(CodeLlamaConfig34B()) - - -@factory -def code_llama_70b() -> pl.LightningModule: - return LlamaModel(CodeLlamaConfig70B()) - - -@factory -def gemma() -> pl.LightningModule: - return GemmaModel(GemmaConfig()) - - -@factory -def gemma_2b() -> pl.LightningModule: - return GemmaModel(GemmaConfig2B()) - - -@factory -def gemma_7b() -> pl.LightningModule: - return GemmaModel(GemmaConfig7B()) - - -@factory -def code_gemma_2b() -> pl.LightningModule: - return GemmaModel(CodeGemmaConfig2B()) - - -@factory -def code_gemma_7b() -> pl.LightningModule: - return GemmaModel(CodeGemmaConfig7B()) - - -__all__ = [ - "mistral", - "mixtral", - "llama2_7b", - "llama3_8b", - "llama2_13b", - "llama2_70b", - "llama3_70b", - "code_llama_7b", - "code_llama_13b", - "code_llama_34b", - "code_llama_70b", - "gemma", - "gemma_2b", - "gemma_7b", - "code_gemma_2b", - "code_gemma_7b", -] diff --git a/nemo/collections/llm/recipes/__init__.py b/nemo/collections/llm/recipes/__init__.py new file mode 100644 index 000000000000..8d4d874362a9 --- /dev/null +++ b/nemo/collections/llm/recipes/__init__.py @@ -0,0 +1,13 @@ +from nemo.collections.llm.recipes import llama2_7b, llama3_8b, llama3_8b_16k, llama3_8b_64k, mistral +from nemo.collections.llm.recipes.log.default import default_log +from nemo.collections.llm.recipes.optim import adam + +__all__ = [ + "llama3_8b", + "llama3_8b_16k", + "llama3_8b_64k", + "llama2_7b", + "mistral", + "adam", + "default_log", +] diff --git a/nemo/collections/llm/recipes/llama2_7b.py b/nemo/collections/llm/recipes/llama2_7b.py new file mode 100644 index 
000000000000..1767dc4690c8 --- /dev/null +++ b/nemo/collections/llm/recipes/llama2_7b.py @@ -0,0 +1,61 @@ +import pytorch_lightning as pl + +from nemo import lightning as nl +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.api import squad +from nemo.collections.llm.gpt.model.llama import Llama2Config7B, LlamaModel +from nemo.collections.llm.peft.api import gpt_lora +from nemo.collections.llm.recipes.log.default import default_log +from nemo.collections.llm.recipes.optim.adam import adam_with_cosine_annealing +from nemo.collections.llm.utils import Partial, factory + +NAME = "llama2_7b" + + +@factory(name=NAME) +def model() -> pl.LightningModule: + return LlamaModel(Llama2Config7B()) + + +@factory(name=NAME) +def trainer(devices=8) -> nl.Trainer: + strategy = nl.MegatronStrategy(tensor_model_parallel_size=2) + + return nl.Trainer( + devices=devices, + max_steps=100, + accelerator="gpu", + strategy=strategy, + plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), + ) + + +@factory(name=NAME + "_hf") +def hf_resume() -> nl.AutoResume: + return nl.AutoResume(import_path="hf://meta-llama/Llama-2-7b-hf") + + +@factory(name=NAME, for_task="llm.pretrain") +def pretrain_recipe() -> Partial: + return Partial( + pretrain, + model=model, + trainer=trainer, + data=squad, + log=default_log, + optim=adam_with_cosine_annealing, + ) + + +@factory(name=NAME, for_task="llm.finetune") +def finetune_recipe() -> Partial: + return Partial( + finetune, + model=model, + trainer=trainer, + data=squad, + log=default_log, + optim=adam_with_cosine_annealing, + peft=gpt_lora, + resume=hf_resume, + ) diff --git a/nemo/collections/llm/recipes/llama3_8b.py b/nemo/collections/llm/recipes/llama3_8b.py new file mode 100644 index 000000000000..34ce418a0701 --- /dev/null +++ b/nemo/collections/llm/recipes/llama3_8b.py @@ -0,0 +1,61 @@ +import pytorch_lightning as pl + +from nemo import lightning as nl +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.api import squad +from nemo.collections.llm.gpt.model.llama import Llama3Config8B, LlamaModel +from nemo.collections.llm.peft.api import gpt_lora +from nemo.collections.llm.recipes.log.default import default_log +from nemo.collections.llm.recipes.optim.adam import adam_with_cosine_annealing +from nemo.collections.llm.utils import Partial, factory + +NAME = "llama3_8b" + + +@factory(name=NAME) +def model() -> pl.LightningModule: + return LlamaModel(Llama3Config8B(seq_length=16384)) + + +@factory(name=NAME) +def trainer(devices=8) -> nl.Trainer: + strategy = nl.MegatronStrategy(tensor_model_parallel_size=2) + + return nl.Trainer( + devices=devices, + max_steps=100, + accelerator="gpu", + strategy=strategy, + plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), + ) + + +@factory(name=NAME + "_hf") +def hf_resume() -> nl.AutoResume: + return nl.AutoResume(import_path="hf://meta-llama/Meta-Llama-3-8B") + + +@factory(name=NAME, for_task="llm.pretrain") +def pretrain_recipe() -> Partial: + return Partial( + pretrain, + model=model, + trainer=trainer, + data=squad, + log=default_log, + optim=adam_with_cosine_annealing, + ) + + +@factory(name=NAME, for_task="llm.finetune") +def finetune_recipe() -> Partial: + return Partial( + finetune, + model=model, + trainer=trainer, + data=squad, + log=default_log, + optim=adam_with_cosine_annealing, + peft=gpt_lora, + resume=hf_resume, + ) diff --git a/nemo/collections/llm/recipes/llama3_8b_16k.py b/nemo/collections/llm/recipes/llama3_8b_16k.py new file 
mode 100644 index 000000000000..3a590f26894e --- /dev/null +++ b/nemo/collections/llm/recipes/llama3_8b_16k.py @@ -0,0 +1,45 @@ +import pytorch_lightning as pl + +from nemo import lightning as nl +from nemo.collections.llm.api import pretrain +from nemo.collections.llm.gpt.data.api import squad +from nemo.collections.llm.gpt.model.llama import Llama3Config8B, LlamaModel +from nemo.collections.llm.recipes.log.default import default_log +from nemo.collections.llm.recipes.optim.adam import adam_with_cosine_annealing +from nemo.collections.llm.utils import Partial, factory + +NAME = "llama3_8b_16k" + + +@factory(name=NAME) +def model() -> pl.LightningModule: + return LlamaModel(Llama3Config8B(seq_length=16384)) + + +@factory(name=NAME) +def trainer(devices=8) -> nl.Trainer: + strategy = nl.MegatronStrategy( + tensor_model_parallel_size=4, + context_parallel_size=2, + sequence_parallel=True, + ) + + return nl.Trainer( + devices=devices, + max_steps=100, + accelerator="gpu", + strategy=strategy, + plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), + ) + + +@factory(name=NAME, for_task="llm.pretrain") +def pretrain_recipe() -> Partial: + return Partial( + pretrain, + model=model, + trainer=trainer, + data=squad, + log=default_log, + optim=adam_with_cosine_annealing, + ) diff --git a/nemo/collections/llm/recipes/llama3_8b_64k.py b/nemo/collections/llm/recipes/llama3_8b_64k.py new file mode 100644 index 000000000000..c826feb28901 --- /dev/null +++ b/nemo/collections/llm/recipes/llama3_8b_64k.py @@ -0,0 +1,45 @@ +import pytorch_lightning as pl + +from nemo import lightning as nl +from nemo.collections.llm.api import pretrain +from nemo.collections.llm.gpt.data.api import squad +from nemo.collections.llm.gpt.model.llama import Llama3Config8B, LlamaModel +from nemo.collections.llm.recipes.log.default import default_log +from nemo.collections.llm.recipes.optim.adam import adam_with_cosine_annealing +from nemo.collections.llm.utils import Partial, factory + +NAME = "llama3_8b_64k" + + +@factory(name=NAME) +def model() -> pl.LightningModule: + return LlamaModel(Llama3Config8B(seq_length=65536)) + + +@factory(name=NAME) +def trainer(devices=8) -> nl.Trainer: + strategy = nl.MegatronStrategy( + tensor_model_parallel_size=8, + context_parallel_size=4, + sequence_parallel=True, + ) + + return nl.Trainer( + devices=devices, + max_steps=100, + accelerator="gpu", + strategy=strategy, + plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), + ) + + +@factory(name=NAME, for_task="llm.pretrain") +def pretrain_recipe() -> Partial: + return Partial( + pretrain, + model=model, + trainer=trainer, + data=squad, + log=default_log, + optim=adam_with_cosine_annealing, + ) diff --git a/nemo/collections/llm/recipes/log/__init__.py b/nemo/collections/llm/recipes/log/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/nemo/collections/llm/recipes/log/default.py b/nemo/collections/llm/recipes/log/default.py new file mode 100644 index 000000000000..a40e141bfa95 --- /dev/null +++ b/nemo/collections/llm/recipes/log/default.py @@ -0,0 +1,15 @@ +from nemo import lightning as nl +from nemo.collections.llm.utils import factory + + +@factory +def default_log() -> nl.NeMoLogger: + ckpt = nl.ModelCheckpoint( + save_best_model=True, + save_last=True, + monitor="reduced_train_loss", + save_top_k=2, + save_on_train_epoch_end=True, + ) + + return nl.NeMoLogger(ckpt=ckpt) diff --git a/nemo/collections/llm/recipes/mistral.py b/nemo/collections/llm/recipes/mistral.py new file mode 100644 index 
000000000000..12af8d5d18ff --- /dev/null +++ b/nemo/collections/llm/recipes/mistral.py @@ -0,0 +1,61 @@ +import pytorch_lightning as pl + +from nemo import lightning as nl +from nemo.collections.llm.api import finetune, pretrain +from nemo.collections.llm.gpt.data.api import squad +from nemo.collections.llm.gpt.model.mistral import MistralConfig7B, MistralModel +from nemo.collections.llm.peft.api import gpt_lora +from nemo.collections.llm.recipes.log.default import default_log +from nemo.collections.llm.recipes.optim.adam import adam_with_cosine_annealing +from nemo.collections.llm.utils import Partial, factory + +NAME = "mistral" + + +@factory(name=NAME) +def model() -> pl.LightningModule: + return MistralModel(MistralConfig7B()) + + +@factory(name=NAME) +def trainer(devices=8) -> nl.Trainer: + strategy = nl.MegatronStrategy(tensor_model_parallel_size=2) + + return nl.Trainer( + devices=devices, + max_steps=100, + accelerator="gpu", + strategy=strategy, + plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"), + ) + + +@factory(name=NAME + "_hf") +def hf_resume() -> nl.AutoResume: + return nl.AutoResume(import_path="hf://mistralai/Mistral-7B-v0.3") + + +@factory(name=NAME, for_task="llm.pretrain") +def pretrain_recipe() -> Partial: + return Partial( + pretrain, + model=model, + trainer=trainer, + data=squad, + log=default_log, + optim=adam_with_cosine_annealing, + ) + + +@factory(name=NAME, for_task="llm.finetune") +def finetune_recipe() -> Partial: + return Partial( + finetune, + model=model, + trainer=trainer, + data=squad, + log=default_log, + optim=adam_with_cosine_annealing, + peft=gpt_lora, + resume=hf_resume, + ) diff --git a/nemo/collections/llm/recipes/optim/__init__.py b/nemo/collections/llm/recipes/optim/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/nemo/collections/llm/recipes/optim/adam.py b/nemo/collections/llm/recipes/optim/adam.py new file mode 100644 index 000000000000..4229001b2130 --- /dev/null +++ b/nemo/collections/llm/recipes/optim/adam.py @@ -0,0 +1,16 @@ +from megatron.core.optimizer import OptimizerConfig + +from nemo import lightning as nl +from nemo.collections.llm.utils import factory + + +@factory +def adam_with_cosine_annealing() -> nl.OptimizerModule: + return nl.MegatronOptimizerModule( + config=OptimizerConfig(optimizer="adam", lr=0.001, use_distributed_optimizer=True), + lr_scheduler=nl.lr_scheduler.CosineAnnealingScheduler(), + ) + + +# TODO: Fix the name-arg inside the factory-function so we don't need to do this +with_cosine_annealing = adam_with_cosine_annealing diff --git a/nemo/collections/llm/utils.py b/nemo/collections/llm/utils.py index b4382d0afd5f..5c087f60590a 100644 --- a/nemo/collections/llm/utils.py +++ b/nemo/collections/llm/utils.py @@ -42,9 +42,8 @@ def factory(*args: Any, **kwargs: Any) -> Union[Callable[[T], T], T]: try: import nemo_sdk as sdk - if not args and not kwargs: - # Used as @factory without arguments - return sdk.factory() + if not args: + return sdk.factory(**kwargs) else: # Used as @factory(*args, **kwargs) return sdk.factory(*args, **kwargs)
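
Reviewer note: the sketch below is not part of the change set; it only illustrates how the factory/Partial pattern introduced by this patch could be reused to register one more pretraining recipe. The module and recipe name ("my_llama3_8b") are hypothetical, and the parallelism/precision settings simply repeat values already used in the recipes above, so treat it as an illustration of the intended extension point rather than a definitive API reference.

# Hypothetical module, e.g. nemo/collections/llm/recipes/my_llama3_8b.py
# (illustration only; mirrors the pattern added in this patch).
import pytorch_lightning as pl

from nemo import lightning as nl
from nemo.collections.llm.api import pretrain
from nemo.collections.llm.gpt.data.api import squad
from nemo.collections.llm.gpt.model.llama import Llama3Config8B, LlamaModel
from nemo.collections.llm.recipes.log.default import default_log
from nemo.collections.llm.recipes.optim.adam import adam_with_cosine_annealing
from nemo.collections.llm.utils import Partial, factory

NAME = "my_llama3_8b"  # hypothetical recipe name


@factory(name=NAME)
def model() -> pl.LightningModule:
    # Model factory: returns the LightningModule wrapping the config.
    return LlamaModel(Llama3Config8B())


@factory(name=NAME)
def trainer(devices=8) -> nl.Trainer:
    # Plain tensor parallelism, as in the llama2_7b/llama3_8b recipes;
    # the 16k/64k variants add context parallelism on top of this.
    strategy = nl.MegatronStrategy(tensor_model_parallel_size=2)

    return nl.Trainer(
        devices=devices,
        max_steps=100,
        accelerator="gpu",
        strategy=strategy,
        plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"),
    )


@factory(name=NAME, for_task="llm.pretrain")
def pretrain_recipe() -> Partial:
    # The recipe is just a Partial that names the registered factories;
    # nothing is instantiated at import time.
    return Partial(
        pretrain,
        model=model,
        trainer=trainer,
        data=squad,
        log=default_log,
        optim=adam_with_cosine_annealing,
    )

The design keeps model, trainer, data, logging, and optimizer as independently registered factories, so a recipe only wires names together; individual pieces (for example trainer(devices=...)) can presumably be overridden when the recipe is resolved for a given task, rather than being fixed at import time.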