From 9799051a8047e1fe8d1f5fca723afc932b0e257e Mon Sep 17 00:00:00 2001
From: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com>
Date: Fri, 10 Jan 2025 19:51:21 +0300
Subject: [PATCH] Lora ckpt in HF format for NeMo AutoModel (#11712)

* Save lora ckpt in safetensor and a config

Signed-off-by: Onur Yilmaz

* remove hf variable from peft

Signed-off-by: Onur Yilmaz

* vllm with automodel peft working

* Apply isort and black reformatting

Signed-off-by: oyilmaz-nvidia

* revert changes

Signed-off-by: Onur Yilmaz

* update examples

Signed-off-by: Onur Yilmaz

* Apply isort and black reformatting

Signed-off-by: oyilmaz-nvidia

* removed unused import

Signed-off-by: Onur Yilmaz

* enable ckpt saving

Signed-off-by: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com>

* remove unused import

Signed-off-by: Onur Yilmaz

* Apply isort and black reformatting

Signed-off-by: oyilmaz-nvidia

* fix minor bug

Signed-off-by: Onur Yilmaz

---------

Signed-off-by: Onur Yilmaz
Signed-off-by: oyilmaz-nvidia
Signed-off-by: Onur Yilmaz <35306097+oyilmaz-nvidia@users.noreply.github.com>
Co-authored-by: oyilmaz-nvidia
---
 .github/workflows/cicd-main.yml            | 13 +--
 examples/llm/peft/hf.py                    | 14 ++-
 examples/llm/peft/hf_vllm.py               | 42 ++++++++
 nemo/export/__init__.py                    |  7 ++
 nemo/export/vllm_hf_exporter.py            | 24 ++++-
 nemo/lightning/io/pl.py                    | 97 +++++++++++++++++++
 nemo/lightning/pytorch/callbacks/peft.py   | 64 +++++++++---
 nemo/lightning/pytorch/strategies/utils.py | 15 ++-
 .../llm/hf/{peft.py => peft_hf.py}         |  1 -
 .../vlm/hf/{peft.py => peft_hf.py}         |  0
 10 files changed, 246 insertions(+), 31 deletions(-)
 mode change 100644 => 100755 examples/llm/peft/hf.py
 create mode 100755 examples/llm/peft/hf_vllm.py
 mode change 100644 => 100755 nemo/lightning/pytorch/strategies/utils.py
 rename tests/collections/llm/hf/{peft.py => peft_hf.py} (99%)
 rename tests/collections/vlm/hf/{peft.py => peft_hf.py} (100%)

diff --git a/.github/workflows/cicd-main.yml b/.github/workflows/cicd-main.yml
index 6b2470791a86..0ebeed09535a 100644
--- a/.github/workflows/cicd-main.yml
+++ b/.github/workflows/cicd-main.yml
@@ -3622,7 +3622,7 @@
     with:
       RUNNER: self-hosted-azure-gpus-1
       SCRIPT: |
-        TRANSFORMERS_OFFLINE=1 python tests/collections/vlm/hf/peft.py --model /home/TestData/vlm/qwen2-2b/ --max-steps 3 --disable-ckpt
+        TRANSFORMERS_OFFLINE=1 python tests/collections/vlm/hf/peft_hf.py --model /home/TestData/vlm/qwen2-2b/ --max-steps 3
       AFTER_SCRIPT: |
         rm -rf nemo_experiments
 
@@ -3633,8 +3633,9 @@
     with:
       RUNNER: self-hosted-azure
      SCRIPT: |
-        TRANSFORMERS_OFFLINE=1 python tests/collections/vlm/hf/peft.py --model /home/TestData/vlm/qwen2-2b/ --max-steps 3 --disable-ckpt --strategy fsdp --devices 2
-
+        TRANSFORMERS_OFFLINE=1 python tests/collections/vlm/hf/peft_hf.py --model /home/TestData/vlm/qwen2-2b/ --max-steps 3 --strategy fsdp --devices 2
+      AFTER_SCRIPT: |
+        rm -rf nemo_experiments
   L2_VLM_HF_Transformer_PEFT_4bit:
     needs: [ cicd-test-container-setup ]
     uses: ./.github/workflows/_test_template.yml
@@ -3642,7 +3643,7 @@
     with:
       RUNNER: self-hosted-azure-gpus-1
       SCRIPT: |
-        TRANSFORMERS_OFFLINE=1 python tests/collections/vlm/hf/peft.py --model /home/TestData/vlm/qwen2-2b/ --max-steps 3 --disable-ckpt --use-4bit
+        TRANSFORMERS_OFFLINE=1 python tests/collections/vlm/hf/peft_hf.py --model /home/TestData/vlm/qwen2-2b/ --max-steps 3 --use-4bit
       AFTER_SCRIPT: |
         rm -rf nemo_experiments
 
@@ -3653,7 +3654,7 @@
     with:
       RUNNER: self-hosted-azure-gpus-1
       SCRIPT: |
-        TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/peft.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --max-steps 10 --disable-ckpt
+        TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/peft_hf.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --max-steps 10
       AFTER_SCRIPT: |
         rm -rf nemo_experiments
 
@@ -3675,7 +3676,7 @@
     with:
       RUNNER: self-hosted-azure
       SCRIPT: |
-        TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/peft.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --max-steps 10 --devices 2 --strategy ddp --disable-ckpt
+        TRANSFORMERS_OFFLINE=1 python tests/collections/llm/hf/peft_hf.py --model /home/TestData/nlp/hf_gemma/hf_gemma_2b --max-steps 10 --devices 2 --strategy ddp --disable-ckpt
       AFTER_SCRIPT: |
         rm -rf nemo_experiments
 
diff --git a/examples/llm/peft/hf.py b/examples/llm/peft/hf.py
old mode 100644
new mode 100755
index c0562663c2cc..1b3776005b02
--- a/examples/llm/peft/hf.py
+++ b/examples/llm/peft/hf.py
@@ -14,8 +14,10 @@
 
 import fiddle as fdl
 from lightning.pytorch.loggers import WandbLogger
+
 from nemo import lightning as nl
 from nemo.collections import llm
+from nemo.lightning import NeMoLogger
 from nemo.lightning.pytorch.callbacks import JitConfig, JitTransform
 
 
@@ -69,6 +71,7 @@ def main():
     parser.add_argument('--max-steps', type=int, default=100)
     parser.add_argument('--wandb-project', type=str, default=None)
     parser.add_argument('--use-torch-jit', action='store_true')
+    parser.add_argument('--ckpt-folder', type=str, default=None)
     args = parser.parse_args()
 
     wandb = None
@@ -84,6 +87,13 @@ def main():
     # https://github.com/Lightning-AI/pytorch-lightning/blob/8ad3e29816a63d8ce5c00ac104b14729a4176f4f/src/lightning/pytorch/plugins/precision/fsdp.py#L81
     grad_clip = None
     use_dist_samp = False
+
+    import tempfile
+
+    if args.ckpt_folder is None:
+        args.ckpt_folder = tempfile.mkdtemp()
+        print("Temp directory created for base model: ", args.ckpt_folder)
+
     tokenizer = llm.HFAutoModelForCausalLM.configure_tokenizer(args.model)
 
     callbacks = []
@@ -110,10 +120,10 @@ def main():
             precision="bf16",
         ),
         optim=fdl.build(llm.adam.pytorch_adam_with_flat_lr(lr=1e-5)),
-        log=None,
+        log=NeMoLogger(log_dir=args.ckpt_folder, use_datetime_version=False),
         peft=llm.peft.LoRA(
             target_modules=['*_proj'],
-            dim=32,
+            dim=8,
         ),
     )
 
diff --git a/examples/llm/peft/hf_vllm.py b/examples/llm/peft/hf_vllm.py
new file mode 100755
index 000000000000..9d0b4202607e
--- /dev/null
+++ b/examples/llm/peft/hf_vllm.py
@@ -0,0 +1,42 @@
+# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+try:
+    from nemo.export.vllm_hf_exporter import vLLMHFExporter
+except Exception:
+    raise Exception(
+        "vLLM should be installed in the environment; in the NeMo FW "
+        "container, activate the vLLM environment with the "
+        "`source /opt/venv/bin/activate` command."
+    )
+
+
+if __name__ == '__main__':
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model', required=True, type=str, help="Local path of the base model")
+    parser.add_argument('--lora-model', required=True, type=str, help="Local path of the LoRA model")
+    # parser.add_argument('--triton-model-name', required=True, type=str, help="Name for the service")
+    args = parser.parse_args()
+
+    lora_model_name = "lora_model"
+
+    exporter = vLLMHFExporter()
+    exporter.export(model=args.model, enable_lora=True)
+    exporter.add_lora_models(lora_model_name=lora_model_name, lora_model=args.lora_model)
+
+    print(
+        "------------- Output: ", exporter.forward(input_texts=["How are you doing?"], lora_model_name=lora_model_name)
+    )
diff --git a/nemo/export/__init__.py b/nemo/export/__init__.py
index d9155f923f18..100d2dcca63b 100644
--- a/nemo/export/__init__.py
+++ b/nemo/export/__init__.py
@@ -11,3 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+
+use_tensorrt = True
+try:
+    from nemo.export.tensorrt_lazy_compiler import trt_compile
+except Exception:
+    use_tensorrt = False
diff --git a/nemo/export/vllm_hf_exporter.py b/nemo/export/vllm_hf_exporter.py
index 2d9754e08767..5a51e0d0b298 100755
--- a/nemo/export/vllm_hf_exporter.py
+++ b/nemo/export/vllm_hf_exporter.py
@@ -19,6 +19,7 @@
 from pytriton.decorators import batch
 from pytriton.model_config import Tensor
 from vllm import LLM, SamplingParams
+from vllm.lora.request import LoRARequest
 
 from nemo.deploy import ITritonDeployable
 from nemo.deploy.utils import cast_output, str_ndarray2list
@@ -48,14 +49,20 @@ class vLLMHFExporter(ITritonDeployable):
 
     def __init__(self):
         self.model = None
+        self.lora_models = None
 
-    def export(self, model):
+    def export(self, model, enable_lora: bool = False):
         """
         Exports the HF checkpoint to vLLM and initializes the engine.
         Args:
             model (str): model name or the path
         """
-        self.model = LLM(model=model)
+        self.model = LLM(model=model, enable_lora=enable_lora)
+
+    def add_lora_models(self, lora_model_name, lora_model):
+        if self.lora_models is None:
+            self.lora_models = {}
+        self.lora_models[lora_model_name] = lora_model
 
     @property
     def get_triton_input(self):
@@ -99,15 +106,24 @@ def forward(
         input_texts: List[str],
         max_output_len: int = 64,
         top_k: int = 1,
-        top_p: float = 0.0,
+        top_p: float = 0.1,
         temperature: float = 1.0,
+        lora_model_name: str = None,
     ):
         assert self.model is not None, "Model is not initialized."
 
+        lora_request = None
+        if lora_model_name is not None:
+            if self.lora_models is None:
+                raise Exception("No LoRA models are available.")
+            assert lora_model_name in self.lora_models, "LoRA model was not added before"
+            lora_request = LoRARequest(lora_model_name, 1, self.lora_models[lora_model_name])
+
         sampling_params = SamplingParams(
             max_tokens=max_output_len, temperature=temperature, top_k=int(top_k), top_p=top_p
         )
-        request_output = self.model.generate(input_texts, sampling_params)
+
+        request_output = self.model.generate(input_texts, sampling_params, lora_request=lora_request)
         output = []
         for o in request_output:
             output.append(o.outputs[0].text)
diff --git a/nemo/lightning/io/pl.py b/nemo/lightning/io/pl.py
index f2c70034fd50..7c4a8787ced1 100644
--- a/nemo/lightning/io/pl.py
+++ b/nemo/lightning/io/pl.py
@@ -331,6 +331,103 @@ def should_remove_missing_sharded_base(x: Any):
     return sharded_state_dict
 
 
+class HuggingFaceCheckpointIO(AsyncCompatibleCheckpointIO, IOMixin):
+    """CheckpointIO that saves and loads checkpoints in the Hugging Face format;
+    LoRA adapters are written as ``adapter_model.safetensors`` plus an ``adapter_config.json``.
+
+    .. warning:: This is an experimental feature.
+
+    """
+
+    def __init__(self, hf_model=None, lora=False):
+        self.hf_model = hf_model
+        self.lora = lora
+
+    @override
+    def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_options: Optional[Any] = None) -> None:
+        """Save model/training states as a checkpoint file through state-dump and file-write.
+
+        Args:
+            checkpoint: dict containing model and trainer state
+            path: write-target path
+            storage_options: not used in ``HuggingFaceCheckpointIO.save_checkpoint``
+
+        Raises
+        ------
+        TypeError:
+            If ``storage_options`` arg is passed in
+
+        """
+
+        if self.lora:
+            from safetensors.torch import save_file
+
+            state_dict = {}
+            for module_name, module_weight in checkpoint["state_dict"].items():
+                new_module_name = module_name.replace("model.model", "base_model.model")
+                new_module_name = new_module_name.replace("lora_a", "lora_A.weight").replace("lora_b", "lora_B.weight")
+                state_dict[new_module_name] = module_weight
+
+            checkpoint_dir = ckpt_to_weights_subdir(path, is_saving=True)
+            fs = get_filesystem(checkpoint_dir)
+            fs.makedirs(checkpoint_dir, exist_ok=True)
+            save_file(state_dict, checkpoint_dir / "adapter_model.safetensors")
+
+    @override
+    def load_checkpoint(
+        self,
+        path: _PATH,
+        sharded_state_dict=None,
+        map_location: Optional[Callable] = None,
+        strict: Optional['StrictHandling'] | bool = None,
+    ) -> Dict[str, Any]:
+        """Loads a Hugging Face-format checkpoint, with additional handling for ``fsspec`` remote loading of files.
+
+        Args:
+            path: Path to checkpoint
+            map_location: a function, :class:`torch.device`, string or a dict specifying how to remap storage
+                locations.
+
+        Returns: The loaded checkpoint.
+
+        Raises
+        ------
+        FileNotFoundError: If ``path`` is not found by the ``fsspec`` filesystem
+
+        """
+
+        # Try to read the checkpoint at `path`. If it does not exist, do not restore the checkpoint.
+        fs = get_filesystem(path)
+        if not fs.exists(path):
+            raise FileNotFoundError(f"Checkpoint file not found: {path}")
+        if not fs.isdir(path):
+            raise ValueError(f"Checkpoints should be a directory. Found: {path}.")
Found: {path}.") + + state_dict = None + if (path / "adaptor_config.json").exists(): + from safetensors import safe_open + + state_dict = {} + with safe_open("adapter_model.safetensors", framework="pt", device=0) as f: + for k in f.keys(): + state_dict[k] = f.get_tensor(k) + + return {'state_dict': state_dict} + + @override + def remove_checkpoint(self, path: _PATH) -> None: + """Remove checkpoint file from the filesystem. + + Args: + path: Path to checkpoint + + """ + fs = get_filesystem(path) + if fs.exists(path): + fs.rm(path, recursive=True) + log.debug(f"Removed checkpoint: {path}") + + def _fix_tensors_device(ckpt: Dict) -> Dict: """Ensure checkpoint tensors are on the correct device.""" assert torch.cuda.is_initialized(), (torch.cuda.is_available(), torch.cuda.is_initialized()) diff --git a/nemo/lightning/pytorch/callbacks/peft.py b/nemo/lightning/pytorch/callbacks/peft.py index 0c559d1b3990..a71d6792d457 100644 --- a/nemo/lightning/pytorch/callbacks/peft.py +++ b/nemo/lightning/pytorch/callbacks/peft.py @@ -142,20 +142,27 @@ def setup(self, trainer: pl.Trainer, pl_module: pl.LightningModule, stage: str) trainer.strategy.trainer = trainer wrapped_io = partial(WrappedAdapterIO, peft=self) - ckpt_io_kwarg_names = [ - "save_ckpt_format", - "async_save", - "torch_dist_multiproc", - "assume_constant_structure", - "parallel_save", - "parallel_save_within_dp", - "parallel_load", - "load_directly_on_device", - ] - ckpt_io_kwargs = { - arg: getattr(trainer.strategy, arg) - for arg in filter(lambda x: hasattr(trainer.strategy, x), ckpt_io_kwarg_names) - } + is_hf_model = getattr(trainer.model, "is_hf_model", False) + if not type(is_hf_model) == type(True): + is_hf_model = False + + if is_hf_model: + ckpt_io_kwargs = {"model_library": "huggingface", "lora": True} + else: + ckpt_io_kwarg_names = [ + "save_ckpt_format", + "async_save", + "torch_dist_multiproc", + "assume_constant_structure", + "parallel_save", + "parallel_save_within_dp", + "parallel_load", + "load_directly_on_device", + ] + ckpt_io_kwargs = { + arg: getattr(trainer.strategy, arg) + for arg in filter(lambda x: hasattr(trainer.strategy, x), ckpt_io_kwarg_names) + } trainer.strategy._checkpoint_io = create_checkpoint_io(wrapping_ckpt_io=wrapped_io, **ckpt_io_kwargs) self.wrapped_io = ( trainer.strategy._checkpoint_io._checkpoint_io @@ -401,14 +408,39 @@ def save_checkpoint(self, checkpoint: Dict[str, Any], path: _PATH, storage_optio from nemo.utils.get_rank import is_global_rank_zero if is_global_rank_zero(): - metadata = {"model_ckpt_path": str(self.model_ckpt_path)} base_dir = ckpt_to_weights_subdir(path, is_saving=True) base_dir.mkdir(parents=True, exist_ok=True) - adapter_meta_path = base_dir / ADAPTER_META_FILENAME + + from nemo.lightning.io.pl import HuggingFaceCheckpointIO + + if isinstance(self.checkpoint_io, HuggingFaceCheckpointIO): + metadata = self._create_lora_hf_config() + adapter_meta_path = base_dir / "adapter_config.json" + else: + metadata = {"model_ckpt_path": str(self.model_ckpt_path)} + adapter_meta_path = base_dir / ADAPTER_META_FILENAME + with open(adapter_meta_path, "w") as f: json.dump(metadata, f) return request + def _create_lora_hf_config(self): + from peft import LoraConfig + from nemo.collections.llm.peft import DoRA + + lora_config = LoraConfig( + r=self.peft.dim, + target_modules=self.peft.target_modules, + lora_alpha=self.peft.alpha, + lora_dropout=self.peft.dropout, + use_dora=isinstance(self.peft, DoRA), + ) + lora_config = lora_config.to_dict() + lora_config["peft_type"] = "LORA" + 
lora_config["megatron_core"] = None + lora_config["target_modules"] = self.peft.target_modules + return lora_config + @override def load_checkpoint( self, diff --git a/nemo/lightning/pytorch/strategies/utils.py b/nemo/lightning/pytorch/strategies/utils.py old mode 100644 new mode 100755 index 51e4a7dbfa19..d2b4a885f817 --- a/nemo/lightning/pytorch/strategies/utils.py +++ b/nemo/lightning/pytorch/strategies/utils.py @@ -32,7 +32,6 @@ from torch.distributed.device_mesh import DeviceMesh from nemo.lightning import _strategy_lib -from nemo.lightning.io.pl import MegatronCheckpointIO from nemo.lightning.pytorch.callbacks import MegatronProgressBar, ProgressPrinter from nemo.utils.callbacks.dist_ckpt_io import AsyncFinalizableCheckpointIO @@ -119,7 +118,19 @@ def ckpt_to_dir(filepath: Union[str, Path]) -> Path: def create_checkpoint_io(wrapping_ckpt_io=None, **kwargs): - checkpoint_io = MegatronCheckpointIO(**kwargs) + model_library = "megatron" + if "model_library" in kwargs.keys(): + model_library = kwargs["model_library"] + + if model_library == "huggingface": + from nemo.lightning.io.pl import HuggingFaceCheckpointIO + + checkpoint_io = HuggingFaceCheckpointIO(lora=kwargs["lora"]) + else: + from nemo.lightning.io.pl import MegatronCheckpointIO + + checkpoint_io = MegatronCheckpointIO(**kwargs) + if wrapping_ckpt_io: checkpoint_io = wrapping_ckpt_io(checkpoint_io) if kwargs.get("async_save", False): diff --git a/tests/collections/llm/hf/peft.py b/tests/collections/llm/hf/peft_hf.py similarity index 99% rename from tests/collections/llm/hf/peft.py rename to tests/collections/llm/hf/peft_hf.py index 3be0443d69fe..7a86d5f9c0ba 100644 --- a/tests/collections/llm/hf/peft.py +++ b/tests/collections/llm/hf/peft_hf.py @@ -46,7 +46,6 @@ def formatting_prompts_func(examples): tokenizer = getattr(tokenizer, 'tokenizer', tokenizer) datamodule = llm.HFDatasetDataModule(data_path, split="train[:100]", pad_token_id=tokenizer.eos_token_id) - datamodule.map( formatting_prompts_func, batched=False, diff --git a/tests/collections/vlm/hf/peft.py b/tests/collections/vlm/hf/peft_hf.py similarity index 100% rename from tests/collections/vlm/hf/peft.py rename to tests/collections/vlm/hf/peft_hf.py