[NeMo-UX] Adding recipes (NVIDIA#9720) (NVIDIA#9851)
* Adding recipe proposal

* Adding more recipes

* Apply isort and black reformatting

* Remove api.py inside llm.gpt.model

* Adding resume to FineTuneRecipe

* Fix spelling error

* Fix spelling error

* Fix spelling error

* Apply isort and black reformatting

* Adding resume to PreTrainRecipe

* update recipe proposal to use sdk.Partial

* Apply isort and black reformatting

* update __init__

* update __init__

* fix return type

* Fix bug in factory

* rename recipe folder to 'models'

* Fixes

* Apply isort and black reformatting

* Bug fixes

* rename models --> configs

* Apply isort and black reformatting

* rename configs --> recipes

* Apply isort and black reformatting

* address comments

---------

Signed-off-by: ashors1 <ashors@nvidia.com>
Signed-off-by: artbataev <artbataev@users.noreply.github.com>
Signed-off-by: marcromeyn <marcromeyn@users.noreply.github.com>
Signed-off-by: ashors1 <ashors1@users.noreply.github.com>
Signed-off-by: Hemil Desai <hemild@nvidia.com>
Signed-off-by: hemildesai <hemildesai@users.noreply.github.com>
Co-authored-by: Marc Romeyn <mromeijn@nvidia.com>
Co-authored-by: artbataev <artbataev@users.noreply.github.com>
Co-authored-by: marcromeyn <marcromeyn@users.noreply.github.com>
Co-authored-by: ashors1 <ashors@nvidia.com>
Co-authored-by: ashors1 <ashors1@users.noreply.github.com>
Co-authored-by: Hemil Desai <hemild@nvidia.com>
Co-authored-by: hemildesai <hemildesai@users.noreply.github.com>
Co-authored-by: Anna Shors <71393111+ashors1@users.noreply.github.com>
9 people authored Jul 30, 2024
1 parent e201b00 commit bd17e77
Showing 13 changed files with 320 additions and 162 deletions.
35 changes: 1 addition & 34 deletions nemo/collections/llm/__init__.py
@@ -42,24 +42,7 @@
     gpt_data_step,
     gpt_forward_step,
 )
-from nemo.collections.llm.gpt.model.api import (
-    code_gemma_2b,
-    code_gemma_7b,
-    code_llama_7b,
-    code_llama_13b,
-    code_llama_34b,
-    code_llama_70b,
-    gemma,
-    gemma_2b,
-    gemma_7b,
-    llama2_7b,
-    llama2_13b,
-    llama2_70b,
-    llama3_8b,
-    llama3_70b,
-    mistral,
-    mixtral,
-)
+from nemo.collections.llm.recipes import *  # noqa
 
 __all__ = [
     "MockDataModule",
@@ -103,21 +86,5 @@
     "mock",
     "squad",
     "dolly",
-    "mistral",
-    "mixtral",
-    "llama2_7b",
-    "llama3_8b",
-    "llama2_13b",
-    "llama2_70b",
-    "llama3_70b",
-    "code_llama_7b",
-    "code_llama_13b",
-    "code_llama_34b",
-    "code_llama_70b",
-    "gemma",
-    "gemma_2b",
-    "gemma_7b",
-    "code_gemma_2b",
-    "code_gemma_7b",
     "peft",
 ]
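
For context, a minimal sketch (hypothetical usage, not part of this diff) of what the new wildcard import exposes, assuming a NeMo build that includes this commit: the per-model recipe modules become attributes of the collection namespace, replacing the flat factory functions that previously came from llm.gpt.model.api.

from nemo.collections import llm

# Re-exported by `from nemo.collections.llm.recipes import *` in __init__.py above:
print(llm.llama2_7b)  # <module 'nemo.collections.llm.recipes.llama2_7b' ...>
print(llm.llama3_8b)  # <module 'nemo.collections.llm.recipes.llama3_8b' ...>
print(llm.mistral)    # <module 'nemo.collections.llm.recipes.mistral' ...>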
125 changes: 0 additions & 125 deletions nemo/collections/llm/gpt/model/api.py

This file was deleted.

13 changes: 13 additions & 0 deletions nemo/collections/llm/recipes/__init__.py
@@ -0,0 +1,13 @@
from nemo.collections.llm.recipes import llama2_7b, llama3_8b, llama3_8b_16k, llama3_8b_64k, mistral
from nemo.collections.llm.recipes.log.default import default_log
from nemo.collections.llm.recipes.optim import adam

__all__ = [
"llama3_8b",
"llama3_8b_16k",
"llama3_8b_64k",
"llama2_7b",
"mistral",
"adam",
"default_log",
]
61 changes: 61 additions & 0 deletions nemo/collections/llm/recipes/llama2_7b.py
@@ -0,0 +1,61 @@
import pytorch_lightning as pl

from nemo import lightning as nl
from nemo.collections.llm.api import finetune, pretrain
from nemo.collections.llm.gpt.data.api import squad
from nemo.collections.llm.gpt.model.llama import Llama2Config7B, LlamaModel
from nemo.collections.llm.peft.api import gpt_lora
from nemo.collections.llm.recipes.log.default import default_log
from nemo.collections.llm.recipes.optim.adam import adam_with_cosine_annealing
from nemo.collections.llm.utils import Partial, factory

NAME = "llama2_7b"


@factory(name=NAME)
def model() -> pl.LightningModule:
return LlamaModel(Llama2Config7B())


@factory(name=NAME)
def trainer(devices=8) -> nl.Trainer:
strategy = nl.MegatronStrategy(tensor_model_parallel_size=2)

return nl.Trainer(
devices=devices,
max_steps=100,
accelerator="gpu",
strategy=strategy,
plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"),
)


@factory(name=NAME + "_hf")
def hf_resume() -> nl.AutoResume:
return nl.AutoResume(import_path="hf://meta-llama/Llama-2-7b-hf")


@factory(name=NAME, for_task="llm.pretrain")
def pretrain_recipe() -> Partial:
return Partial(
pretrain,
model=model,
trainer=trainer,
data=squad,
log=default_log,
optim=adam_with_cosine_annealing,
)


@factory(name=NAME, for_task="llm.finetune")
def finetune_recipe() -> Partial:
return Partial(
finetune,
model=model,
trainer=trainer,
data=squad,
log=default_log,
optim=adam_with_cosine_annealing,
peft=gpt_lora,
resume=hf_resume,
)
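
A rough usage sketch (hypothetical driver code, outside this commit), assuming the @factory-decorated functions stay directly callable: each piece of the llama2_7b recipe can also be built on its own.

from nemo.collections.llm.recipes import llama2_7b

llama = llama2_7b.model()               # LlamaModel configured with Llama2Config7B
trainer = llama2_7b.trainer(devices=4)  # devices defaults to 8 and can be overridden per call
resume = llama2_7b.hf_resume()          # AutoResume importing hf://meta-llama/Llama-2-7b-hf
recipe = llama2_7b.finetune_recipe()    # Partial(finetune, ..., peft=gpt_lora, resume=hf_resume)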
61 changes: 61 additions & 0 deletions nemo/collections/llm/recipes/llama3_8b.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import pytorch_lightning as pl

from nemo import lightning as nl
from nemo.collections.llm.api import finetune, pretrain
from nemo.collections.llm.gpt.data.api import squad
from nemo.collections.llm.gpt.model.llama import Llama3Config8B, LlamaModel
from nemo.collections.llm.peft.api import gpt_lora
from nemo.collections.llm.recipes.log.default import default_log
from nemo.collections.llm.recipes.optim.adam import adam_with_cosine_annealing
from nemo.collections.llm.utils import Partial, factory

NAME = "llama3_8b"


@factory(name=NAME)
def model() -> pl.LightningModule:
return LlamaModel(Llama3Config8B(seq_length=16384))


@factory(name=NAME)
def trainer(devices=8) -> nl.Trainer:
strategy = nl.MegatronStrategy(tensor_model_parallel_size=2)

return nl.Trainer(
devices=devices,
max_steps=100,
accelerator="gpu",
strategy=strategy,
plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"),
)


@factory(name=NAME + "_hf")
def hf_resume() -> nl.AutoResume:
return nl.AutoResume(import_path="hf://meta-llama/Meta-Llama-3-8B")


@factory(name=NAME, for_task="llm.pretrain")
def pretrain_recipe() -> Partial:
return Partial(
pretrain,
model=model,
trainer=trainer,
data=squad,
log=default_log,
optim=adam_with_cosine_annealing,
)


@factory(name=NAME, for_task="llm.finetune")
def finetune_recipe() -> Partial:
return Partial(
finetune,
model=model,
trainer=trainer,
data=squad,
log=default_log,
optim=adam_with_cosine_annealing,
peft=gpt_lora,
resume=hf_resume,
)
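
Worth noting how the registrations pair up: pretrain_recipe and finetune_recipe are both registered under the name "llama3_8b" but for different tasks ("llm.pretrain" vs "llm.finetune"), while hf_resume is registered separately as "llama3_8b_hf", so only the fine-tuning path starts from the imported Hugging Face checkpoint. A hypothetical variation (the name and checkpoint below are illustrative, not registered by this commit) that points the resume hook at a different checkpoint:

from nemo import lightning as nl
from nemo.collections.llm.utils import factory


@factory(name="llama3_8b_instruct_hf")  # hypothetical factory name
def hf_resume_instruct() -> nl.AutoResume:
    # Same mechanism as hf_resume above, pointed at the instruct checkpoint.
    return nl.AutoResume(import_path="hf://meta-llama/Meta-Llama-3-8B-Instruct")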
45 changes: 45 additions & 0 deletions nemo/collections/llm/recipes/llama3_8b_16k.py
@@ -0,0 +1,45 @@
import pytorch_lightning as pl

from nemo import lightning as nl
from nemo.collections.llm.api import pretrain
from nemo.collections.llm.gpt.data.api import squad
from nemo.collections.llm.gpt.model.llama import Llama3Config8B, LlamaModel
from nemo.collections.llm.recipes.log.default import default_log
from nemo.collections.llm.recipes.optim.adam import adam_with_cosine_annealing
from nemo.collections.llm.utils import Partial, factory

NAME = "llama3_8b_16k"


@factory(name=NAME)
def model() -> pl.LightningModule:
return LlamaModel(Llama3Config8B(seq_length=16384))


@factory(name=NAME)
def trainer(devices=8) -> nl.Trainer:
strategy = nl.MegatronStrategy(
tensor_model_parallel_size=4,
context_parallel_size=2,
sequence_parallel=True,
)

return nl.Trainer(
devices=devices,
max_steps=100,
accelerator="gpu",
strategy=strategy,
plugins=nl.MegatronMixedPrecision(precision="bf16-mixed"),
)


@factory(name=NAME, for_task="llm.pretrain")
def pretrain_recipe() -> Partial:
return Partial(
pretrain,
model=model,
trainer=trainer,
data=squad,
log=default_log,
optim=adam_with_cosine_annealing,
)
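
A quick sanity check (a sketch assuming standard Megatron data-parallel sizing, not code from the commit) of how the 16k recipe's parallel sizes map onto its 8 devices:

# tensor_model_parallel_size=4 and context_parallel_size=2 shard one model
# replica across 4 * 2 = 8 GPUs, so devices=8 yields exactly one
# data-parallel replica per node for the 16k-sequence configuration.
tp, cp, devices = 4, 2, 8
assert devices % (tp * cp) == 0
data_parallel_size = devices // (tp * cp)  # -> 1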