NeMo 2.0 vLLM export (#11543)
* Initial commit: non-nemotron vllm export

Signed-off-by: Piotr Kaminski <piotrus.kaminski@gmail.com>

* Apply isort and black reformatting

Signed-off-by: Laplasjan107 <Laplasjan107@users.noreply.github.com>

* add docstring

Signed-off-by: Piotr Kaminski <piotrus.kaminski@gmail.com>

* fix gemma

Signed-off-by: Piotr Kaminski <piotrus.kaminski@gmail.com>

* Apply isort and black reformatting

Signed-off-by: Laplasjan107 <Laplasjan107@users.noreply.github.com>

* update requirements

Signed-off-by: Piotr Kaminski <piotrus.kaminski@gmail.com>

* Apply isort and black reformatting

Signed-off-by: Laplasjan107 <Laplasjan107@users.noreply.github.com>

* code review + bug fix

Signed-off-by: Piotr Kaminski <piotrus.kaminski@gmail.com>

* Apply isort and black reformatting

Signed-off-by: Laplasjan107 <Laplasjan107@users.noreply.github.com>

* Bugfix

Signed-off-by: Piotr Kamiński <67481570+Laplasjan107@users.noreply.github.com>

* code review

Signed-off-by: Piotr Kaminski <piotrus.kaminski@gmail.com>

* fix typing

Signed-off-by: Piotr Kaminski <piotrus.kaminski@gmail.com>

* Apply isort and black reformatting

Signed-off-by: Laplasjan107 <Laplasjan107@users.noreply.github.com>

* import torch_dist loader from nemo.export.trtllm

Signed-off-by: Piotr Kaminski <piotrus.kaminski@gmail.com>

* Apply isort and black reformatting

Signed-off-by: Laplasjan107 <Laplasjan107@users.noreply.github.com>

* handle torch_dist in nemo 1.0

Signed-off-by: Piotr Kaminski <piotrus.kaminski@gmail.com>

* Apply isort and black reformatting

Signed-off-by: Laplasjan107 <Laplasjan107@users.noreply.github.com>

* add github ci test

Signed-off-by: Piotr Kaminski <piotrus.kaminski@gmail.com>

* review + venv install

Signed-off-by: Piotr Kaminski <piotrus.kaminski@gmail.com>

* Apply isort and black reformatting

Signed-off-by: Laplasjan107 <Laplasjan107@users.noreply.github.com>

* move venv installation

Signed-off-by: Piotr Kaminski <piotrus.kaminski@gmail.com>

* retry running test

Signed-off-by: Piotr Kaminski <piotrus.kaminski@gmail.com>

* remove RUN keyword

Signed-off-by: Piotr Kaminski <piotrus.kaminski@gmail.com>

* modify llama config

Signed-off-by: Piotr Kaminski <piotrus.kaminski@gmail.com>

* fix conversion script

Signed-off-by: Piotr Kaminski <piotrus.kaminski@gmail.com>

* add pip install flags

Signed-off-by: Piotr Kaminski <piotrus.kaminski@gmail.com>

* move vllm venv to separate run step

Signed-off-by: Piotr Kaminski <piotrus.kaminski@gmail.com>

* try different json format

Signed-off-by: Piotr Kaminski <piotrus.kaminski@gmail.com>

* add more config overrides to the test

Signed-off-by: Piotr Kaminski <piotrus.kaminski@gmail.com>

* another approach to config overrides

Signed-off-by: Piotr Kaminski <piotrus.kaminski@gmail.com>

* fix test llama head_dim

Signed-off-by: Piotr Kamiński <67481570+Laplasjan107@users.noreply.github.com>

* pin cdifflib down

Signed-off-by: Piotr Kaminski <piotrus.kaminski@gmail.com>

---------

Signed-off-by: Piotr Kaminski <piotrus.kaminski@gmail.com>
Signed-off-by: Laplasjan107 <Laplasjan107@users.noreply.github.com>
Signed-off-by: Piotr Kamiński <67481570+Laplasjan107@users.noreply.github.com>
Co-authored-by: Laplasjan107 <Laplasjan107@users.noreply.github.com>
Laplasjan107 authored Jan 13, 2025
1 parent 7f3ac6b commit e0c97aa
Showing 12 changed files with 238 additions and 51 deletions.
31 changes: 31 additions & 0 deletions .github/workflows/cicd-main.yml
@@ -4915,6 +4915,36 @@ jobs:
      AFTER_SCRIPT: |
        rm -rf /tmp/nemo2_llava_next_results

  L2_NeMo_2_VLLM_EXPORT:
    needs: [cicd-test-container-setup]
    uses: ./.github/workflows/_test_template.yml
    if: contains(fromJSON(needs.cicd-test-container-setup.outputs.test_to_run), 'L2_NeMo_2_VLLM_EXPORT') || needs.cicd-test-container-setup.outputs.all == 'true'
    with:
      RUNNER: self-hosted-azure
      SCRIPT: |
        python tests/setup/models/create_hf_model.py \
          --model_name_or_path /home/TestData/nlp/megatron_llama/llama-ci-hf \
          --output_dir /tmp/llama_head64 \
          --config_updates "{\"hidden_size\": 512, \"num_attention_heads\": 4, \"num_key_value_heads\": 4, \"intermediate_size\": 1024, \"head_dim\": 128, \"num_hidden_layers\": 2, \"torch_dtype\": \"float16\"}"
        python tests/collections/llm/test_hf_import.py --hf_model /tmp/llama_head64 --output_path /tmp/nemo2_ckpt
        /opt/venv/bin/python tests/export/nemo_export.py \
          --min_tps 1 \
          --max_tps 1 \
          --use_vllm True \
          --model_type llama \
          --max_output_len 128 \
          --test_deployment True \
          --model_name nemo2_ckpt \
          --model_dir /tmp/vllm_from_nemo2 \
          --checkpoint_dir /tmp/nemo2_ckpt
      AFTER_SCRIPT: |
        rm -rf /tmp/llama_head64
        rm -rf /tmp/nemo2_ckpt
        rm -rf /tmp/vllm_from_nemo2
  Nemo_CICD_Test:
    needs:
      - pre-flight
@@ -5102,6 +5132,7 @@ jobs:
      - L2_NeMo_2_LLAVA_NEXT_MOCK_TRAINING
      - L2_HF_Transformer_SFT_FSDP2_2gpu
      - L2_HF_Transformer_SFT_2gpu_nemorun_fsdp2
      - L2_NeMo_2_VLLM_EXPORT
    if: always()
    runs-on: ubuntu-latest
    steps:
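For readability, the escaped --config_updates string in the CI job above corresponds to the following plain dict; a minimal sketch (standard library only) that reproduces the single-line JSON passed on the command line:

import json

# The --config_updates payload from the CI job above, as a plain dict.
config_updates = {
    "hidden_size": 512,
    "num_attention_heads": 4,
    "num_key_value_heads": 4,
    "intermediate_size": 1024,
    "head_dim": 128,
    "num_hidden_layers": 2,
    "torch_dtype": "float16",
}

# json.dumps emits the compact form that the workflow embeds in the shell command.
print(json.dumps(config_updates))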
7 changes: 7 additions & 0 deletions Dockerfile.ci
@@ -98,4 +98,11 @@ pip install --no-cache-dir --no-build-isolation ".[all]"
chmod 777 -R /workspace
EOF

# Install vLLM in virtualenv
RUN pip install --no-cache-dir --no-build-isolation virtualenv && \
    virtualenv /opt/venv && \
    /opt/venv/bin/pip install --no-cache-dir --no-build-isolation \
        -r /workspace/requirements/requirements_vllm.txt \
        -r /workspace/requirements/requirements_infer.txt

ENV PYTHONPATH="${PYTHONPATH}:/workspace/Megatron-LM"
23 changes: 18 additions & 5 deletions nemo/export/sentencepiece_tokenizer.py
@@ -29,15 +29,28 @@ class SentencePieceTokenizer:
        special_tokens: either a list of special tokens or a dictionary of token name to token value
        legacy: when set to True, the previous behavior of the SentencePiece wrapper will be restored,
            including the possibility to add special tokens inside the wrapper.
        tokenizer: an existing SentencePieceProcessor instance to wrap instead of loading one from model_path
    """

    def __init__(
        self, model_path: str, special_tokens: Optional[Union[Dict[str, str], List[str]]] = None, legacy: bool = False
        self,
        model_path: Optional[str] = None,
        special_tokens: Optional[Union[Dict[str, str], List[str]]] = None,
        legacy: bool = False,
        tokenizer: Optional[sentencepiece.SentencePieceProcessor] = None,
    ):
        if not model_path or not os.path.exists(model_path):
            raise ValueError(f"model_path: {model_path} is invalid")
        self.tokenizer = sentencepiece.SentencePieceProcessor()
        self.tokenizer.Load(model_path)
        model_path_provided = model_path is not None
        tokenizer_provided = tokenizer is not None
        if not (model_path_provided ^ tokenizer_provided):
            raise ValueError("Exactly one of the arguments 'model_path' and 'tokenizer' should be provided")

        if tokenizer_provided:
            self.tokenizer = tokenizer
        else:
            if not model_path or not os.path.exists(model_path):
                raise ValueError(f"model_path: {model_path} is invalid")
            self.tokenizer = sentencepiece.SentencePieceProcessor()
            self.tokenizer.Load(model_path)

        self.original_vocab_size = self.tokenizer.get_piece_size()
        self.vocab_size = self.tokenizer.get_piece_size()
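A minimal usage sketch of the new constructor contract (the tokenizer model path is a placeholder):

import sentencepiece

from nemo.export.sentencepiece_tokenizer import SentencePieceTokenizer

# Original behavior: load the SentencePiece model from a file on disk.
tok_from_file = SentencePieceTokenizer(model_path="/path/to/tokenizer.model")

# New in this commit: wrap an already-loaded SentencePieceProcessor.
sp = sentencepiece.SentencePieceProcessor()
sp.Load("/path/to/tokenizer.model")
tok_wrapped = SentencePieceTokenizer(tokenizer=sp)

# Passing both arguments, or neither, raises a ValueError.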
5 changes: 4 additions & 1 deletion nemo/export/trt_llm/nemo_ckpt_loader/nemo_file.py
@@ -26,7 +26,6 @@
import torch
import yaml
import zarr
from tensorrt_llm._utils import np_bfloat16, str_dtype_to_torch
from torch.distributed.checkpoint import FileSystemReader
from torch.distributed.checkpoint.metadata import BytesStorageMetadata, TensorStorageMetadata
from torch.distributed.checkpoint.state_dict_loader import load_state_dict
@@ -186,6 +185,8 @@ def load_sharded_metadata_torch_dist(checkpoint_dir: Union[Path, TarPath], torch
    if not torch_tensor:
        for k, v in state_dict.items():
            if v.dtype == torch.bfloat16:
                from tensorrt_llm._utils import np_bfloat16

                state_dict[k] = v.view(torch.int16).numpy().view(np_bfloat16)
            else:
                state_dict[k] = v.numpy()
@@ -225,6 +226,8 @@ def load_sharded_metadata_zarr(checkpoint_dir: Union[Path, TarPath], torch_tenso
            if arr.dtype.name == "bfloat16":
                sharded_state_dict[key] = torch.from_numpy(arr[:].view(np.int16)).view(torch.bfloat16)
            else:
                from tensorrt_llm._utils import str_dtype_to_torch

                sharded_state_dict[key] = torch.from_numpy(arr[:]).view(str_dtype_to_torch(arr.dtype.name))
        else:
            sharded_state_dict[key] = arr[:]
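For context, a minimal sketch of the bfloat16 round-trip served by the imports this change defers into the branches; moving them off the module top level means tensorrt_llm is only required when a conversion is actually hit:

import torch

t = torch.randn(4, dtype=torch.bfloat16)

# NumPy has no native bfloat16 dtype, so the tensor's bits are first
# reinterpreted as int16 before crossing into NumPy...
as_int16 = t.view(torch.int16).numpy()

# ...and then relabeled as bfloat16 via tensorrt_llm's NumPy dtype
# (imported lazily here, mirroring the diff above).
from tensorrt_llm._utils import np_bfloat16

as_bf16 = as_int16.view(np_bfloat16)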
4 changes: 4 additions & 0 deletions nemo/export/utils/__init__.py
@@ -11,3 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from nemo.export.utils.utils import is_nemo2_checkpoint

__all__ = ["is_nemo2_checkpoint"]
28 changes: 28 additions & 0 deletions nemo/export/utils/utils.py
@@ -0,0 +1,28 @@
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pathlib import Path


def is_nemo2_checkpoint(checkpoint_path: str) -> bool:
    """
    Checks if the checkpoint is in NeMo 2.0 format.
    Args:
        checkpoint_path (str): Path to a checkpoint.
    Returns:
        bool: True if the path points to a NeMo 2.0 checkpoint; False otherwise.
    """

    ckpt_path = Path(checkpoint_path)
    return (ckpt_path / 'context').is_dir()
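A quick usage sketch (the checkpoint path matches the one produced in the CI job above):

from nemo.export.utils import is_nemo2_checkpoint

# A NeMo 2.0 checkpoint is a directory with a 'context' subdirectory;
# a NeMo 1.0 .nemo archive is not, so it returns False here.
if is_nemo2_checkpoint("/tmp/nemo2_ckpt"):
    print("NeMo 2.0 checkpoint detected")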
39 changes: 38 additions & 1 deletion nemo/export/vllm/engine.py
@@ -15,6 +15,8 @@
import logging
from pathlib import Path

from sentencepiece import SentencePieceProcessor
from transformers import PreTrainedTokenizerBase
from vllm import LLMEngine
from vllm.transformers_utils.tokenizer_group.tokenizer_group import TokenizerGroup

@@ -25,15 +27,50 @@
LOGGER = logging.getLogger("NeMo")


class vLLMTokenizerGroup(TokenizerGroup):
    """
    Implements a custom tokenizer group for vLLM, backed by a Hugging Face tokenizer.
    """

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def get_lora_tokenizer(self, lora_request):
        return self.tokenizer

    async def get_lora_tokenizer_async(self, lora_request):
        return self.tokenizer

    @property
    def max_input_length(self, lora_request=None):
        return None


class NemoLLMEngine(LLMEngine):
    """
    Overrides some functionality from vllm.LLMEngine to use our custom tokenizer
    instead of one from Transformers.
    """

    def _init_tokenizer(self, **tokenizer_init_kwargs):
        # Find the tokenizer file name in the Nemo checkpoint config
        # Determine if the model needs a bos token (which is not stored in Nemo checkpoints)
        add_bos_token = self.model_config.model_converter.requires_bos_token()
        tokenizer_config = self.model_config.nemo_model_config.get('tokenizer', {})

        if not isinstance(tokenizer_config, dict) and hasattr(tokenizer_config, 'tokenizer'):
            tokenizer = tokenizer_config.tokenizer

            if isinstance(tokenizer, SentencePieceProcessor):
                self.model_config.hf_config.bos_token_id = tokenizer.bos_token_id
                self.model_config.hf_config.eos_token_id = tokenizer.eos_token_id

                tokenizer = SentencePieceTokenizer(tokenizer=tokenizer)
                return NemoTokenizerGroup(tokenizer, add_bos_token=add_bos_token)

            if isinstance(tokenizer, PreTrainedTokenizerBase):
                return vLLMTokenizerGroup(tokenizer)

        # Find the tokenizer file name in the Nemo checkpoint config
        tokenizer_model = tokenizer_config.get('model', tokenizer_config.get('tokenizer_model', None))

        # If there is no tokenizer file specified but there's a reference to an HF tokenizer, use that
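A minimal sketch of what the thin vLLMTokenizerGroup wrapper guarantees; the gpt2 tokenizer is a hypothetical stand-in for the checkpoint's Hugging Face tokenizer:

from transformers import AutoTokenizer

from nemo.export.vllm.engine import vLLMTokenizerGroup

hf_tokenizer = AutoTokenizer.from_pretrained("gpt2")  # hypothetical stand-in

group = vLLMTokenizerGroup(hf_tokenizer)

# The same HF tokenizer is returned regardless of any LoRA request,
# and no input-length cap is imposed.
assert group.get_lora_tokenizer(None) is hf_tokenizer
assert group.max_input_length is None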
85 changes: 57 additions & 28 deletions nemo/export/vllm/model_config.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

from pathlib import Path
from typing import Any, Dict, Optional, Union

import torch
@@ -21,6 +22,7 @@
from vllm.transformers_utils.config import get_hf_text_config

from nemo.export.tarutils import TarPath
from nemo.export.utils import is_nemo2_checkpoint
from nemo.export.vllm.model_converters import get_model_converter


@@ -88,39 +90,32 @@ def __init__(
        if self.model_converter is None:
            raise RuntimeError(f'Unknown model type "{model_type}"')

        hf_to_nemo_dict = {
            'hidden_size': 'hidden_size',
            'intermediate_size': 'ffn_hidden_size',
            'num_hidden_layers': 'num_layers',
            'num_attention_heads': 'num_attention_heads',
            'num_key_value_heads': 'num_query_groups',
            # 'hidden_act': 'activation', ## <- vLLM has good defaults for the models, nemo values are wrong
            'max_position_embeddings': ['max_position_embeddings', 'encoder_seq_length'],
            'rms_norm_eps': 'layernorm_epsilon',
            'attention_dropout': 'attention_dropout',
            'initializer_range': 'init_method_std',
            'norm_epsilon': 'layernorm_epsilon',
            'rope_theta': 'rotary_base',
            'use_bias': 'bias',
        }
        if is_nemo2_checkpoint(nemo_checkpoint):
            from nemo.lightning.io import load_context

        with TarPath(nemo_checkpoint) as archive:
            with (archive / "model_config.yaml").open("r") as model_config_file:
                self.nemo_model_config = yaml.load(model_config_file, Loader=yaml.SafeLoader)
            nemo_checkpoint: Path = Path(nemo_checkpoint)

        hf_args = {}
        for hf_arg, nemo_arg in hf_to_nemo_dict.items():
            if not isinstance(nemo_arg, list):
                nemo_arg = [nemo_arg]
            with (nemo_checkpoint / "context/model.yaml").open('r') as config_file:
                self.nemo_model_config: dict = yaml.load(config_file, Loader=yaml.SafeLoader)

            for nemo_arg_option in nemo_arg:
                value = self.nemo_model_config.get(nemo_arg_option)
                if value is not None:
                    hf_args[hf_arg] = value
                    break
            hf_args = self._load_hf_arguments(self.nemo_model_config['config'])
            tokenizer = load_context((nemo_checkpoint / "context"), subpath="model.tokenizer")

        self.model_converter.convert_config(self.nemo_model_config, hf_args)
            if hasattr(tokenizer, 'bos_id'):
                tokenizer.tokenizer.bos_token_id = tokenizer.bos_id
            if hasattr(tokenizer, 'eos_id'):
                tokenizer.tokenizer.eos_token_id = tokenizer.eos_id

            hf_args['vocab_size'] = tokenizer.original_vocab_size
            self.model_converter.convert_config(self.nemo_model_config['config'], hf_args)
            self.hf_config = AutoConfig.for_model(model_type, **hf_args)
            self.nemo_model_config['tokenizer'] = tokenizer
        else:
            with TarPath(nemo_checkpoint) as archive:
                with (archive / "model_config.yaml").open("r") as model_config_file:
                    self.nemo_model_config = yaml.load(model_config_file, Loader=yaml.SafeLoader)
            hf_args = self._load_hf_arguments(self.nemo_model_config)
            self.model_converter.convert_config(self.nemo_model_config, hf_args)
            self.hf_config = AutoConfig.for_model(model_type, **hf_args)

        self.hf_config.architectures = [self.model_converter.get_architecture()]
@@ -142,3 +137,37 @@ def __init__(
        self._verify_embedding_mode()
        self._verify_quantization()
        self._verify_cuda_graph()

    def _load_hf_arguments(self, nemo_config: Dict[str, Any]) -> Dict[str, Any]:
        """
        Maps argument names used in NeMo to their corresponding names in HF.
        """

        hf_to_nemo_dict = {
            'hidden_size': 'hidden_size',
            'intermediate_size': 'ffn_hidden_size',
            'num_hidden_layers': 'num_layers',
            'num_attention_heads': 'num_attention_heads',
            'num_key_value_heads': 'num_query_groups',
            # 'hidden_act': 'activation', ## <- vLLM has good defaults for the models, nemo values are wrong
            'max_position_embeddings': ['max_position_embeddings', 'encoder_seq_length'],
            'rms_norm_eps': 'layernorm_epsilon',
            'attention_dropout': 'attention_dropout',
            'initializer_range': 'init_method_std',
            'norm_epsilon': 'layernorm_epsilon',
            'rope_theta': 'rotary_base',
            'use_bias': ['bias', 'add_bias_linear'],
        }

        hf_args = {}
        for hf_arg, nemo_arg in hf_to_nemo_dict.items():
            if not isinstance(nemo_arg, list):
                nemo_arg = [nemo_arg]

            for nemo_arg_option in nemo_arg:
                value = nemo_config.get(nemo_arg_option)
                if value is not None:
                    hf_args[hf_arg] = value
                    break

        return hf_args
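A standalone sketch of the first-match fallback behavior in _load_hf_arguments, trimmed to two mappings with hypothetical config values:

from typing import Any, Dict

# Trimmed copy of the mapping: a single NeMo key, and a fallback list of keys.
hf_to_nemo = {
    'intermediate_size': 'ffn_hidden_size',
    'max_position_embeddings': ['max_position_embeddings', 'encoder_seq_length'],
}


def load_hf_arguments(nemo_config: Dict[str, Any]) -> Dict[str, Any]:
    hf_args = {}
    for hf_arg, nemo_arg in hf_to_nemo.items():
        options = nemo_arg if isinstance(nemo_arg, list) else [nemo_arg]
        for option in options:  # the first NeMo key with a value wins
            value = nemo_config.get(option)
            if value is not None:
                hf_args[hf_arg] = value
                break
    return hf_args


# 'encoder_seq_length' is used only because 'max_position_embeddings' is absent.
print(load_hf_arguments({'ffn_hidden_size': 1024, 'encoder_seq_length': 2048}))
# -> {'intermediate_size': 1024, 'max_position_embeddings': 2048}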