Merge branch 'main' into opt_lora_dropout
guyueh1 authored Aug 28, 2024
2 parents 15e3844 + e68f981 commit 0b56ac9
Showing 20 changed files with 1,057 additions and 68 deletions.
2 changes: 1 addition & 1 deletion docs/source/multimodal/text2img/sd.rst
@@ -163,7 +163,7 @@ Optimization related configurations
Training with precached latents
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Since the VAE and text encoder remain frozed during training, you can pre-calculate the image and caption latents offline, enhancing training throughput. To create a pre-cached dataset, see :doc:`Multimodal Dataset <./datasets>`. For training using this dataset, configure ``model.data`` section properly and set ``model.first_stage_key=image_encoded`` along with ``model.cond_stage_key=captions_encoded``.
Since the VAE and text encoder remain frozen during training, you can pre-calculate the image and caption latents offline, enhancing training throughput. To create a pre-cached dataset, see :doc:`Multimodal Dataset <./datasets>`. For training using this dataset, configure ``model.data`` section properly and set ``model.first_stage_key=image_encoded`` along with ``model.cond_stage_key=captions_encoded``.
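
As a hedged illustration of the pre-cached latents setup described in the paragraph above (this sketch is not part of the original doc): only the first_stage_key and cond_stage_key values come from the documentation; the surrounding config structure is a placeholder.

    from omegaconf import OmegaConf

    # Minimal sketch: point an SD training config at pre-cached latents.
    # The initial structure here is illustrative; real configs carry many more fields.
    cfg = OmegaConf.create({"model": {"first_stage_key": "images", "cond_stage_key": "captions"}})
    cfg.model.first_stage_key = "image_encoded"    # image latents pre-computed by the frozen VAE
    cfg.model.cond_stage_key = "captions_encoded"  # caption embeddings pre-computed by the frozen text encoder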

Reference
-----------
[New changed file: path not captured in this excerpt]
@@ -46,8 +46,8 @@

import os
from argparse import ArgumentParser
from functools import cache

import inflect
import regex as re
from tqdm import tqdm

@@ -60,12 +60,21 @@
)
from nemo.utils import logging

engine = inflect.engine()

@cache
def inflect_engine():
import inflect

return inflect.engine()


# these are all words that can appear in a verbalized number, this list will be used later as a filter to detect numbers in verbalizations
number_verbalizations = list(range(0, 20)) + list(range(20, 100, 10))
number_verbalizations = (
[engine.number_to_words(x, zero="zero").replace("-", " ").replace(",", "") for x in number_verbalizations]
[
inflect_engine().number_to_words(x, zero="zero").replace("-", " ").replace(",", "")
for x in number_verbalizations
]
+ ["hundred", "thousand", "million", "billion", "trillion"]
+ ["point"]
)
@@ -85,7 +94,7 @@ def process_url(o):
"""

def flatten(l):
""" flatten a list of lists """
"""flatten a list of lists"""
return [item for sublist in l for item in sublist]

if o != '<self>' and '_letter' in o:
@@ -129,6 +138,7 @@ def convert2digits(digits: str):
Return:
res: number verbalization of the integer prefix of the input
"""
engine = inflect_engine()
res = []
for i, x in enumerate(digits):
if x in digit:
@@ -145,6 +155,7 @@ def convert2digits(digits: str):


def convert(example):
engine = inflect_engine()
cls, written, spoken = example

written = convert_fraction(written)
@@ -288,7 +299,7 @@ def convert(example):
def ignore(example):
"""
This function makes sure specific class types like 'PLAIN', 'ELECTRONIC' etc. are left unchanged.
Args:
example: data example
"""
@@ -300,7 +311,7 @@ def ignore(example):


def process_file(fp):
""" Reading the raw data from a file of NeMo format and preprocesses it. Write is out to the output directory.
"""Reading the raw data from a file of NeMo format and preprocesses it. Write is out to the output directory.
For more info about the data format, refer to the
`text_normalization doc <https://github.com/NVIDIA/NeMo/blob/main/docs/source/nlp/text_normalization.rst>`.
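
The hunks above (and the matching change in cleaners.py below) replace a module-level engine = inflect.engine() with a cached lazy initializer, so importing the module no longer pays for importing inflect or constructing the engine. A minimal standalone sketch of the pattern, runnable on its own if inflect is installed:

    from functools import cache


    @cache
    def inflect_engine():
        # Imported inside the function so the enclosing module stays cheap to import;
        # @cache guarantees the engine is built once and reused on every later call.
        import inflect

        return inflect.engine()


    # First call builds the engine; subsequent calls return the cached instance.
    print(inflect_engine().number_to_words(42))  # forty-two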
16 changes: 12 additions & 4 deletions nemo/collections/common/parts/preprocessing/cleaners.py
@@ -14,7 +14,6 @@

import re

import inflect
from text_unidecode import unidecode

from nemo.utils import logging
@@ -139,7 +138,14 @@
]


inflect = inflect.engine()
from functools import cache


@cache
def inflect_engine():
import inflect

return inflect.engine()


def clean_text(string, table, punctuation_to_replace, abbreviation_version=None):
@@ -194,11 +200,12 @@ def reset(self):
self.currency = None

def format_final_number(self, whole_num, decimal):
inflect = inflect_engine()
if self.currency:
return_string = inflect.number_to_words(whole_num)
return_string += " dollar" if whole_num == 1 else " dollars"
if decimal:
return_string += " and " + inflect.number_to_words(decimal)
return_string += " and " + inflect_engine().number_to_words(decimal)
return_string += " cent" if whole_num == decimal else " cents"
self.reset()
return return_string
@@ -210,11 +217,12 @@ def format_final_number(self, whole_num, decimal):
else:
# Check if there are non-numbers
def convert_to_word(match):
return " " + inflect.number_to_words(match.group(0)) + " "
return " " + inflect_engine().number_to_words(match.group(0)) + " "

return re.sub(r'[0-9,]+', convert_to_word, whole_num)

def clean(self, match):
inflect = inflect_engine()
ws = match.group(2)
number = match.group(3)
_proceeding_symbol = match.group(7)
7 changes: 4 additions & 3 deletions nemo/collections/common/tokenizers/en_ja_tokenizers.py
@@ -14,9 +14,6 @@
import re
from typing import List

from pangu import spacing
from sacremoses import MosesDetokenizer, MosesPunctNormalizer, MosesTokenizer

try:
import ipadic
import MeCab
@@ -36,6 +33,8 @@ class EnJaProcessor:
"""

def __init__(self, lang_id: str):
from sacremoses import MosesDetokenizer, MosesPunctNormalizer, MosesTokenizer

self.lang_id = lang_id
self.moses_tokenizer = MosesTokenizer(lang=lang_id)
self.moses_detokenizer = MosesDetokenizer(lang=lang_id)
@@ -81,6 +80,8 @@ def __init__(self):
self.mecab_tokenizer = MeCab.Tagger(ipadic.MECAB_ARGS + " -Owakati")

def detokenize(self, text: List[str]) -> str:
from pangu import spacing

RE_WS_IN_FW = re.compile(
r'([\u2018\u2019\u201c\u201d\u2e80-\u312f\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff00-\uffef])\s+(?=[\u2018\u2019\u201c\u201d\u2e80-\u312f\u3200-\u32ff\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff00-\uffef])'
)
4 changes: 2 additions & 2 deletions nemo/collections/common/tokenizers/indic_tokenizers.py
@@ -14,8 +14,6 @@

from typing import List

from sacremoses import MosesDetokenizer, MosesPunctNormalizer, MosesTokenizer


class IndicProcessor:
"""
@@ -26,6 +24,8 @@ class IndicProcessor:
def __init__(self, lang_id: str):
if lang_id != 'hi':
raise NotImplementedError
from sacremoses import MosesDetokenizer, MosesPunctNormalizer, MosesTokenizer

self.moses_tokenizer = MosesTokenizer(lang=lang_id)
self.moses_detokenizer = MosesDetokenizer(lang=lang_id)
self.normalizer = MosesPunctNormalizer(lang=lang_id)
4 changes: 2 additions & 2 deletions nemo/collections/common/tokenizers/moses_tokenizers.py
@@ -14,15 +14,15 @@

from typing import List

from sacremoses import MosesDetokenizer, MosesPunctNormalizer, MosesTokenizer


class MosesProcessor:
"""
Tokenizer, Detokenizer and Normalizer utilities in Moses
"""

def __init__(self, lang_id: str):
from sacremoses import MosesDetokenizer, MosesPunctNormalizer, MosesTokenizer

self.moses_tokenizer = MosesTokenizer(lang=lang_id)
self.moses_detokenizer = MosesDetokenizer(lang=lang_id)
self.normalizer = MosesPunctNormalizer(lang=lang_id)
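
The three tokenizer files above apply the same idea: the sacremoses (and pangu) imports move from module level into __init__ and method bodies, so they are only imported when a processor is actually constructed. Call sites are unchanged; a hedged usage sketch, assuming sacremoses is installed:

    from nemo.collections.common.tokenizers.moses_tokenizers import MosesProcessor

    # sacremoses is imported here, at construction time, rather than when the module loads.
    processor = MosesProcessor(lang_id="en")
    # The attributes set up in __init__ (see the diff above) expose the Moses utilities.
    tokens = processor.moses_tokenizer.tokenize("Hello, world!")
    print(tokens)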
18 changes: 18 additions & 0 deletions nemo/collections/llm/__init__.py
@@ -39,6 +39,9 @@
Llama2Config70B,
Llama3Config8B,
Llama3Config70B,
Llama31Config8B,
Llama31Config70B,
Llama31Config405B,
LlamaConfig,
LlamaModel,
MaskedTokenLossReduction,
@@ -55,6 +58,12 @@
Nemotron4Config340B,
NemotronConfig,
NemotronModel,
Qwen2Config,
Qwen2Config1P5B,
Qwen2Config7B,
Qwen2Config72B,
Qwen2Config500M,
Qwen2Model,
gpt_data_step,
gpt_forward_step,
)
@@ -93,6 +102,9 @@
"Llama2Config70B",
"Llama3Config8B",
"Llama3Config70B",
"Llama31Config8B",
"Llama31Config70B",
"Llama31Config405B",
"CodeLlamaConfig7B",
"CodeLlamaConfig13B",
"CodeLlamaConfig34B",
@@ -111,6 +123,12 @@
"ChatGLM2Config6B",
"ChatGLM3Config6B",
"ChatGLMModel",
"Qwen2Model",
"Qwen2Config7B",
"Qwen2Config",
"Qwen2Config500M",
"Qwen2Config1P5B",
"Qwen2Config72B",
"PreTrainingDataModule",
"FineTuningDataModule",
"SquadDataModule",
20 changes: 20 additions & 0 deletions nemo/collections/llm/gpt/model/__init__.py
@@ -27,6 +27,9 @@
Llama2Config70B,
Llama3Config8B,
Llama3Config70B,
Llama31Config8B,
Llama31Config70B,
Llama31Config405B,
LlamaConfig,
LlamaModel,
)
@@ -46,6 +49,14 @@
NemotronConfig,
NemotronModel,
)
from nemo.collections.llm.gpt.model.qwen2 import (
Qwen2Config,
Qwen2Config1P5B,
Qwen2Config7B,
Qwen2Config72B,
Qwen2Config500M,
Qwen2Model,
)

__all__ = [
"GPTConfig",
@@ -62,6 +73,9 @@
"Llama2Config70B",
"Llama3Config8B",
"Llama3Config70B",
"Llama31Config8B",
"Llama31Config70B",
"Llama31Config405B",
"NemotronConfig",
"Nemotron3Config4B",
"Nemotron3Config8B",
@@ -87,6 +101,12 @@
"ChatGLM2Config6B",
"ChatGLM3Config6B",
"ChatGLMModel",
"Qwen2Config",
"Qwen2Config500M",
"Qwen2Config1P5B",
"Qwen2Config7B",
"Qwen2Config72B",
"Qwen2Model",
"MaskedTokenLossReduction",
"gpt_data_step",
"gpt_forward_step",
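
The two __init__ diffs above only extend the import and __all__ lists so the new Llama 3.1 and Qwen2 configs are reachable from the collection namespace. A hedged usage sketch; the constructor arguments expected by Qwen2Model beyond the config are an assumption here and depend on the trainer/tokenizer setup:

    from nemo.collections.llm import Qwen2Config7B, Qwen2Model

    config = Qwen2Config7B()    # preset hyperparameters for the 7B variant
    model = Qwen2Model(config)  # assumed: the config is the first positional argument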
12 changes: 11 additions & 1 deletion nemo/collections/llm/gpt/model/base.py
@@ -13,6 +13,7 @@
from nemo.lightning import get_vocab_size, io
from nemo.lightning.megatron_parallel import MaskedTokenLossReduction
from nemo.lightning.pytorch.optim import MegatronOptimizerModule, OptimizerModule
from nemo.utils import logging

HAVE_TE = True
try:
@@ -131,10 +132,19 @@ def configure_model(self, tokenizer) -> "MCoreGPTModel":
if not isinstance(transformer_layer_spec, ModuleSpec):
transformer_layer_spec = transformer_layer_spec(self)

if hasattr(self, 'vocab_size'):
vocab_size = self.vocab_size
logging.info(
f"Use preset vocab_size: {vocab_size}, original vocab_size: {tokenizer.vocab_size}, dummy tokens:"
f" {vocab_size - tokenizer.vocab_size}."
)
else:
vocab_size = get_vocab_size(self, tokenizer.vocab_size, self.make_vocab_size_divisible_by)

return MCoreGPTModel(
self,
transformer_layer_spec=transformer_layer_spec,
vocab_size=get_vocab_size(self, tokenizer.vocab_size, self.make_vocab_size_divisible_by),
vocab_size=vocab_size,
max_sequence_length=self.seq_length,
fp16_lm_cross_entropy=self.fp16_lm_cross_entropy,
parallel_output=self.parallel_output,
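
The configure_model hunk above lets a GPT config pin vocab_size explicitly (logging how many dummy tokens that implies) instead of always deriving a padded size from the tokenizer. A standalone sketch of the same decision, where pad_to stands in for make_vocab_size_divisible_by and the rounding only approximates what get_vocab_size does:

    from typing import Optional


    def resolve_vocab_size(preset: Optional[int], tokenizer_vocab: int, pad_to: int = 128) -> int:
        if preset is not None:
            # A preset always wins; anything beyond tokenizer_vocab behaves as dummy/padding tokens.
            return preset
        # Otherwise round the tokenizer vocabulary up to a multiple of pad_to,
        # which keeps the embedding/output shapes friendly to tensor parallelism.
        return ((tokenizer_vocab + pad_to - 1) // pad_to) * pad_to


    assert resolve_vocab_size(None, 32000) == 32000     # already a multiple of 128
    assert resolve_vocab_size(None, 32100) == 32128     # padded up to the next multiple
    assert resolve_vocab_size(151936, 32000) == 151936  # preset overrides the derived size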
(The remaining changed files in this commit were not rendered on this page.)
